Files
OfficeFileHandle/docx2md/AI友好型.py
ChuXun f9ee42815d 1
2026-01-04 15:29:07 +08:00

120 lines
4.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import subprocess
import os
import re
import shutil
import urllib.parse # 用于处理文件名中的 %20 空格
from pathlib import Path
# Matches Markdown images ``![alt](path)`` and HTML ``<img ... src="path" ...>``.
# Group 2 holds the Markdown path, group 4 holds the HTML ``src`` value.
_IMAGE_REF_RE = re.compile(
    r'(!\[.*?\]\((.*?)\))|(<img[^>]+src=["\'](.*?)["\'][^>]*>)',
    re.IGNORECASE,
)


def _relocate_images(content, images_dir):
    """Copy each referenced image into *images_dir* and rewrite its reference.

    Every Markdown/HTML image reference in *content* whose file exists on disk
    (resolved against the current working directory) is copied to
    ``images_dir/Image_NNN.<ext>`` and replaced by an "upload image NNN here"
    placeholder block. References whose file cannot be found are left
    untouched and a warning is printed.

    Args:
        content: Markdown text to scan.
        images_dir: Existing directory that receives the renamed copies.

    Returns:
        A ``(new_content, copied_count)`` tuple.
    """
    copied = 0

    def replace_ref(match):
        nonlocal copied

        raw_path = match.group(2) or match.group(4) or ""
        if not raw_path:
            return match.group(0)

        # Decode percent-escapes (e.g. "image%201.png" -> "image 1.png") and
        # normalise Windows backslashes to forward slashes.
        original_path = urllib.parse.unquote(raw_path).replace("\\", "/")

        # Fall back to ".png" when the reference carries no extension.
        ext = os.path.splitext(original_path)[1] or ".png"
        # Numbering starts at 001 and only advances after a successful copy.
        new_filename = f"Image_{copied + 1:03d}{ext}"

        # The reference is relative to the working directory the Markdown
        # file was written to, so resolve it from there.
        src_full_path = os.path.join(os.getcwd(), original_path)
        if not os.path.isfile(src_full_path):
            # Source image missing: keep the original reference as-is.
            print(f"⚠️ 警告: 找不到图片文件 {src_full_path}")
            return match.group(0)

        shutil.copy2(src_full_path, os.path.join(images_dir, new_filename))
        copied += 1

        # Placeholder telling the reader (or an AI chat user) which image
        # file to upload at this position.
        return (
            f"\n\n---\n"
            f"> 📸 **[指令:请在此处上传图片 {copied:03d}]**\n"
            f"> 文件名: `{new_filename}`\n"
            f"---\n\n"
        )

    new_content = _IMAGE_REF_RE.sub(replace_ref, content)
    return new_content, copied


def convert_final_clean(docx_file):
    """Convert a .docx file to compact Markdown with sequentially named images.

    Runs Pandoc to produce ``<stem>_Compact.md`` (GitHub-flavoured Markdown)
    in the current working directory, extracting media into a temporary
    ``<stem>_temp_media`` folder. Each image reference found in the Markdown
    is then replaced by an upload placeholder while the image itself is
    copied to ``<stem>_Images_Ordered/Image_NNN.<ext>``. Finally the text is
    compacted (repeated spaces, ``{width=...}`` markers and surplus blank
    lines are removed) and the temporary media folder is deleted.

    Args:
        docx_file: Path to the input .docx document.

    Returns:
        None. Progress and errors are reported via ``print``. If Pandoc
        cannot be run or exits with an error, the function prints the failure
        and returns early without touching any output folder.
    """
    # --- Configuration ---
    base_name = Path(docx_file).stem
    output_md = f"{base_name}_Compact.md"
    # Temporary folder (unordered media; removed once the script finishes).
    temp_media_folder = f"{base_name}_temp_media"
    # Final folder (images renamed in document order).
    final_images_folder = f"{base_name}_Images_Ordered"

    print("🔄 正在启动 Pandoc 转换...")

    # 1. Pandoc conversion (also extracts the original images).
    cmd = [
        "pandoc", docx_file, "-o", output_md,
        "--to=gfm",
        "--standalone",
        f"--extract-media={temp_media_folder}",  # extract into the temp dir
        "--wrap=none",
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Pandoc 转换失败: {e}")
        # stderr was captured, so surface Pandoc's own diagnostic as well;
        # the exception text alone only carries the exit status.
        stderr_text = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        if stderr_text:
            print(stderr_text)
        return
    except (OSError, ValueError) as e:
        # E.g. the pandoc executable is not installed / not on PATH.
        print(f"❌ Pandoc 转换失败: {e}")
        return

    # Prepare the final image folder, discarding any previous run's output.
    if os.path.exists(final_images_folder):
        shutil.rmtree(final_images_folder)
    os.makedirs(final_images_folder)

    with open(output_md, "r", encoding="utf-8") as f:
        content = f.read()

    print("🔄 正在整理图片并重命名...")

    # 2. Copy images under ordered names and replace their references.
    content, img_count = _relocate_images(content, final_images_folder)

    # 3. Table and text clean-up.
    content = re.sub(r' +', ' ', content)  # collapse runs of spaces
    content = re.sub(r'\{width=.*?\}', '', content)  # drop pandoc size markers
    content = re.sub(r'\n{3,}', '\n\n', content)  # collapse extra blank lines

    # 4. Save the Markdown.
    with open(output_md, "w", encoding="utf-8") as f:
        f.write(content)

    # 5. Remove the temporary media folder. Failure here is not fatal, but it
    #    is reported instead of being silently swallowed.
    if os.path.exists(temp_media_folder):
        try:
            shutil.rmtree(temp_media_folder)
            print(f"🗑️ 已删除临时文件夹: {temp_media_folder}")
        except OSError as e:
            print(f"⚠️ 警告: 无法删除临时文件夹 {temp_media_folder}: {e}")

    print("\n" + "=" * 50)
    print("✅ 处理完成!")
    print(f"📂 请务必打开这个文件夹找图片: 【 {final_images_folder} 】")
    print(f"📄 Markdown 文件: {output_md}")
    print(f"🔢 共处理图片: {img_count}")
    print("=" * 50)
if __name__ == "__main__":
    import sys

    # Take the .docx path from the first command-line argument when given;
    # otherwise fall back to "1.docx" in the current directory.
    convert_final_clean(sys.argv[1] if len(sys.argv) > 1 else "1.docx")