import subprocess
import os
import re
import shutil
import urllib.parse  # decodes percent-escapes such as %20 in image paths
from pathlib import Path

# Matches either a Markdown image ``![alt](path)`` or an HTML ``<img ... src="path" ...>`` tag.
# group(2) is the Markdown path, group(4) is the HTML src value.
IMAGE_PATTERN = re.compile(
    r'(!\[.*?\]\((.*?)\))|(<img[^>]+src=["\'](.*?)["\'][^>]*>)',
    re.IGNORECASE,
)


def _rewrite_images(content, final_images_folder):
    """Copy every referenced image into *final_images_folder* under a sequential name.

    Each image reference in *content* whose file exists on disk is replaced by
    an "upload image NNN here" placeholder block. References whose file cannot
    be found are left untouched and a warning is printed.

    Returns a ``(new_content, copied_count)`` tuple.
    """
    img_count = 0

    def img_replacer(m):
        nonlocal img_count

        raw_path = m.group(2) or m.group(4) or ""
        if not raw_path:
            return m.group(0)

        # Decode escapes ("media/image%201.png" -> "media/image 1.png") and
        # normalise Windows backslashes.
        original_path = urllib.parse.unquote(raw_path).replace("\\", "/")

        # Fall back to .png when the reference carries no extension.
        ext = os.path.splitext(original_path)[1] or ".png"
        new_filename = f"Image_{img_count + 1:03d}{ext}"  # numbering starts at 001

        # Paths in the Markdown are relative to the working directory the
        # Markdown file was written to.
        src_full_path = os.path.join(os.getcwd(), original_path)
        dest_full_path = os.path.join(final_images_folder, new_filename)

        # isfile (not exists): a directory here would make copy2 raise.
        if not os.path.isfile(src_full_path):
            print(f"⚠️ 警告: 找不到图片文件 {src_full_path}")
            return m.group(0)

        shutil.copy2(src_full_path, dest_full_path)
        img_count += 1  # only count images that were actually copied

        return (
            f"\n\n---\n"
            f"> 📸 **[指令:请在此处上传图片 {img_count:03d}]**\n"
            f"> 文件名: `{new_filename}`\n"
            f"---\n\n"
        )

    new_content = IMAGE_PATTERN.sub(img_replacer, content)
    return new_content, img_count


def _compact_markdown(content):
    """Collapse space runs, drop pandoc ``{width=...}`` markers and squeeze blank lines."""
    # NOTE: this collapses every run of spaces, including leading indentation.
    content = re.sub(r' +', ' ', content)
    content = re.sub(r'\{width=.*?\}', '', content)
    content = re.sub(r'\n{3,}', '\n\n', content)
    return content


def convert_final_clean(docx_file):
    """Convert *docx_file* to compact Markdown with sequentially numbered images.

    Steps:
      1. Run pandoc to write ``<stem>_Compact.md`` and extract media into a
         temporary ``<stem>_temp_media`` folder.
      2. Copy each referenced image into ``<stem>_Images_Ordered`` as
         ``Image_001.ext``, ``Image_002.ext``, ... and replace the reference
         with an upload placeholder.
      3. Compact the text and write the Markdown back.
      4. Delete the temporary media folder.

    All output paths are relative to the current working directory. If pandoc
    cannot be run or exits with an error, a message is printed and the function
    returns without touching the output folders.

    Returns:
        None.
    """
    path_obj = Path(docx_file)
    base_name = path_obj.stem
    output_md = f"{base_name}_Compact.md"

    # Unordered pandoc output; removed at the end of the run.
    temp_media_folder = f"{base_name}_temp_media"
    # Final, sequentially named images.
    final_images_folder = f"{base_name}_Images_Ordered"

    print("🔄 正在启动 Pandoc 转换...")

    # 1. Pandoc conversion (extracts the original images).
    cmd = [
        "pandoc",
        docx_file,
        "-o", output_md,
        "--to=gfm",
        "--standalone",
        f"--extract-media={temp_media_folder}",
        "--wrap=none",
    ]

    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Pandoc 转换失败: {e}")
        # stderr was captured, so surface it; otherwise the cause is invisible.
        if e.stderr:
            print(e.stderr.decode("utf-8", errors="replace").strip())
        return
    except (OSError, ValueError) as e:
        # OSError covers a missing or non-executable pandoc binary.
        print(f"❌ Pandoc 转换失败: {e}")
        return

    # Start from an empty destination folder.
    if os.path.exists(final_images_folder):
        shutil.rmtree(final_images_folder)
    os.makedirs(final_images_folder)

    with open(output_md, 'r', encoding='utf-8') as f:
        content = f.read()

    print("🔄 正在整理图片并重命名...")

    # 2. Copy and renumber images, replacing references with placeholders.
    content, img_count = _rewrite_images(content, final_images_folder)

    # 3. Table and text clean-up.
    content = _compact_markdown(content)

    # 4. Save the Markdown.
    with open(output_md, 'w', encoding='utf-8') as f:
        f.write(content)

    # 5. Remove the temporary media folder (best effort, but report failures).
    if os.path.exists(temp_media_folder):
        try:
            shutil.rmtree(temp_media_folder)
            print(f"🗑️ 已删除临时文件夹: {temp_media_folder}")
        except OSError as e:
            print(f"⚠️ 警告: 无法删除临时文件夹 {temp_media_folder}: {e}")

    print("\n" + "=" * 50)
    print("✅ 处理完成!")
    print(f"📂 请务必打开这个文件夹找图片: 【 {final_images_folder} 】")
    print(f"📄 Markdown 文件: {output_md}")
    print(f"🔢 共处理图片: {img_count} 张")
    print("=" * 50)


if __name__ == "__main__":
    # Make sure this is the real name of your docx file.
    convert_final_clean("1.docx")