This commit is contained in:
ChuXun
2026-01-04 15:29:07 +08:00
parent 8d4419b1a0
commit f9ee42815d
28 changed files with 1095 additions and 1 deletions

120
docx2md/AI友好型.py Normal file
View File

@@ -0,0 +1,120 @@
import subprocess
import os
import re
import shutil
import urllib.parse # 用于处理文件名中的 %20 空格
from pathlib import Path
# Matches either a Markdown image `![alt](path)` (path in group 2) or an HTML
# `<img ... src="path" ...>` tag (path in group 4), case-insensitively.
_IMAGE_REF_RE = re.compile(
    r'(!\[.*?\]\((.*?)\))|(<img[^>]+src=["\'](.*?)["\'][^>]*>)',
    re.IGNORECASE,
)


def _run_pandoc(cmd):
    """Run the pandoc command; return True on success, False after reporting a failure."""
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Pandoc 转换失败: {e}")
        # Output is captured, so pandoc's own diagnostics would otherwise be
        # lost; echo them so the user can see why the conversion failed.
        stderr_text = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        if stderr_text:
            print(stderr_text)
        return False
    except (OSError, ValueError) as e:
        # OSError: pandoc is missing or not executable.
        # ValueError: the arguments could not be passed to the OS.
        print(f"❌ Pandoc 转换失败: {e}")
        return False
    return True


def _relocate_images(content, final_images_folder):
    """Copy referenced images into the ordered folder and rewrite the references.

    Returns a ``(new_content, copied_count)`` tuple. References whose file
    cannot be found or copied are left untouched in the text.
    """
    img_count = 0

    def img_replacer(m):
        nonlocal img_count
        # group(2) is the Markdown path, group(4) is the HTML src path.
        raw_path = m.group(2) or m.group(4) or ""
        if not raw_path:
            return m.group(0)

        # Decode percent-escapes (e.g. "media/image%201.png") and normalise
        # Windows backslashes to forward slashes.
        original_path = urllib.parse.unquote(raw_path).replace("\\", "/")

        # Fall back to ".png" when the reference has no extension.
        ext = os.path.splitext(original_path)[1] or ".png"
        new_filename = f"Image_{img_count + 1:03d}{ext}"  # numbering starts at 001

        # Referenced paths are resolved against the current working directory.
        src_full_path = os.path.join(os.getcwd(), original_path)
        dest_full_path = os.path.join(final_images_folder, new_filename)

        # isfile (not exists): a reference that resolves to a directory must
        # not be handed to copy2.
        if not os.path.isfile(src_full_path):
            print(f"⚠️ 警告: 找不到图片文件 {src_full_path}")
            return m.group(0)
        try:
            shutil.copy2(src_full_path, dest_full_path)
        except OSError as e:
            # One unreadable image should not abort the whole conversion.
            print(f"⚠️ 警告: 复制图片失败 {src_full_path}: {e}")
            return m.group(0)

        # Only count images that were actually copied.
        img_count += 1
        return (
            f"\n\n---\n"
            f"> 📸 **[指令:请在此处上传图片 {img_count:03d}]**\n"
            f"> 文件名: `{new_filename}`\n"
            f"---\n\n"
        )

    new_content = _IMAGE_REF_RE.sub(img_replacer, content)
    return new_content, img_count


def _compact_markdown(content):
    """Collapse space runs, drop pandoc `{width=...}` markers and squeeze blank lines."""
    content = re.sub(r' +', ' ', content)
    content = re.sub(r'\{width=.*?\}', '', content)
    content = re.sub(r'\n{3,}', '\n\n', content)
    return content


def _remove_temp_folder(temp_media_folder):
    """Best-effort removal of the temporary media folder; warn instead of raising."""
    if not os.path.exists(temp_media_folder):
        return
    try:
        shutil.rmtree(temp_media_folder)
    except OSError as e:
        print(f"⚠️ 警告: 无法删除临时文件夹 {temp_media_folder}: {e}")
    else:
        print(f"🗑️ 已删除临时文件夹: {temp_media_folder}")


def convert_final_clean(docx_file):
    """Convert a .docx file to compact Markdown with sequentially numbered images.

    Steps:
      1. Run pandoc to write ``<stem>_Compact.md`` (GFM) and extract media
         into ``<stem>_temp_media``; both paths are relative to the current
         working directory.
      2. Recreate ``<stem>_Images_Ordered`` from scratch. Each image
         reference whose file exists is copied there as ``Image_001.<ext>``,
         ``Image_002.<ext>``, ... in order of appearance, and the reference
         is replaced by an upload-instruction block.
      3. Compact the text (space runs, pandoc width markers, blank lines)
         and write the Markdown file back.
      4. Delete the temporary media folder, even if step 2 or 3 raised.

    Args:
        docx_file: Path of the input .docx file handed to pandoc.

    Returns:
        None. Progress and failures are reported on stdout; if pandoc cannot
        be run or exits with an error, the function returns early.
    """
    # --- configuration ---
    path_obj = Path(docx_file)
    base_name = path_obj.stem
    output_md = f"{base_name}_Compact.md"
    # Temporary folder (pandoc's own layout); removed at the end.
    temp_media_folder = f"{base_name}_temp_media"
    # Final folder holding the images in document order.
    final_images_folder = f"{base_name}_Images_Ordered"

    print("🔄 正在启动 Pandoc 转换...")
    # 1. Pandoc conversion (also extracts the original images).
    cmd = ["pandoc", docx_file, "-o", output_md,
           "--to=gfm",
           "--standalone",
           f"--extract-media={temp_media_folder}",
           "--wrap=none"]
    if not _run_pandoc(cmd):
        return

    try:
        # Start from an empty destination folder so stale images from a
        # previous run cannot mix with the new numbering.
        if os.path.exists(final_images_folder):
            shutil.rmtree(final_images_folder)
        os.makedirs(final_images_folder)

        with open(output_md, 'r', encoding='utf-8') as f:
            content = f.read()

        print("🔄 正在整理图片并重命名...")
        # 2. Copy and rename images, rewriting their references.
        content, img_count = _relocate_images(content, final_images_folder)

        # 3. Table / text clean-up.
        content = _compact_markdown(content)

        # 4. Save the Markdown.
        with open(output_md, 'w', encoding='utf-8') as f:
            f.write(content)
    finally:
        # 5. Always remove the temporary media folder, even on failure.
        _remove_temp_folder(temp_media_folder)

    print("\n" + "=" * 50)
    print("✅ 处理完成!")
    print(f"📂 请务必打开这个文件夹找图片: 【 {final_images_folder}")
    print(f"📄 Markdown 文件: {output_md}")
    print(f"🔢 共处理图片: {img_count}")
    print("=" * 50)
if __name__ == "__main__":
    # Script entry point. The input .docx path may be given as the first
    # command-line argument; with no argument the original default "1.docx"
    # is used, so existing invocations behave exactly as before.
    import sys

    convert_final_clean(sys.argv[1] if len(sys.argv) > 1 else "1.docx")