mirror of
https://github.com/ChuXunYu/OfficeFileHandle.git
synced 2026-01-31 10:11:25 +00:00
1
This commit is contained in:
120
docx2md/AI友好型.py
Normal file
120
docx2md/AI友好型.py
Normal file
@@ -0,0 +1,120 @@
|
||||
import subprocess
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import urllib.parse # 用于处理文件名中的 %20 空格
|
||||
from pathlib import Path
|
||||
|
||||
def convert_final_clean(docx_file):
    """Convert a .docx file into compact Markdown with ordered image files.

    The function runs Pandoc to produce ``<stem>_Compact.md`` in the current
    working directory, copies every image referenced by that Markdown into
    ``<stem>_Images_Ordered`` under sequential names (``Image_001.png``,
    ``Image_002.png``, ...), replaces each image reference in the Markdown
    with an "upload image here" placeholder block, applies a few text
    clean-ups, and finally deletes the temporary media folder that Pandoc
    extracted the images into.

    Args:
        docx_file: Path to the input .docx document.

    Returns:
        None. Progress and errors are reported with ``print``. If Pandoc
        cannot be run or exits with an error, a message is printed and the
        function returns early without touching the image folder.
    """
    # --- Configuration ---
    base_name = Path(docx_file).stem
    output_md = f"{base_name}_Compact.md"

    # Temporary folder: Pandoc's unordered extraction target, removed at the end.
    temp_media_folder = f"{base_name}_temp_media"
    # Final folder: images renamed in order of appearance.
    final_images_folder = f"{base_name}_Images_Ordered"

    print("🔄 正在启动 Pandoc 转换...")

    # 1. Pandoc conversion (also extracts the original images).
    cmd = [
        "pandoc", docx_file, "-o", output_md,
        "--to=gfm",
        "--standalone",
        f"--extract-media={temp_media_folder}",  # extract into the temp folder
        "--wrap=none",
    ]

    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Pandoc 转换失败: {e}")
        # stderr is captured, so surface it explicitly; otherwise the user
        # only sees the exit status and not Pandoc's actual complaint.
        detail = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        if detail:
            print(f"   stderr: {detail}")
        return
    except Exception as e:
        # Best-effort boundary kept from the original behaviour (e.g. the
        # pandoc executable is missing): report and give up.
        print(f"❌ Pandoc 转换失败: {e}")
        return

    # Prepare the final folder; wipe any stale copy from a previous run.
    if os.path.exists(final_images_folder):
        shutil.rmtree(final_images_folder)
    os.makedirs(final_images_folder)

    with open(output_md, 'r', encoding='utf-8') as f:
        content = f.read()

    print("🔄 正在整理图片并重命名...")

    # 2. Image collection and renaming.
    img_count = 0

    def img_replacer(m):
        """Copy the matched image to the ordered folder and return a placeholder."""
        nonlocal img_count

        # group(2) is the Markdown ![](...) target, group(4) the HTML src value.
        raw_path = m.group(2) or m.group(4) or ""

        if m.group(2):
            # A Markdown target may carry an optional title:
            # ![alt](path "title"). Drop it so only the path remains.
            raw_path = re.sub(r'\s+(["\']).*?\1\s*$', "", raw_path).strip()

        if not raw_path:
            return m.group(0)

        # Decode percent-escapes, e.g. "media/image%201.png" -> "media/image 1.png".
        original_path = urllib.parse.unquote(raw_path)
        # Normalise Windows backslashes.
        original_path = original_path.replace("\\", "/")

        # Build the new sequential file name (Image_001.png, ...).
        ext = os.path.splitext(original_path)[1]
        if not ext:
            ext = ".png"  # fall back when the reference has no extension
        new_filename = f"Image_{img_count + 1:03d}{ext}"

        # The path in the Markdown is resolved against the current working
        # directory to locate the extracted file on disk.
        src_full_path = os.path.join(os.getcwd(), original_path)
        dest_full_path = os.path.join(final_images_folder, new_filename)

        if not os.path.exists(src_full_path):
            # Source image missing: warn and leave the reference untouched.
            print(f"⚠️ 警告: 找不到图片文件 {src_full_path}")
            return m.group(0)

        shutil.copy2(src_full_path, dest_full_path)
        # Only count images that were actually copied, so numbering has no gaps.
        img_count += 1

        # Placeholder block telling the reader which image to upload here.
        return (
            f"\n\n---\n"
            f"> 📸 **[指令:请在此处上传图片 {img_count:03d}]**\n"
            f"> 文件名: `{new_filename}`\n"
            f"---\n\n"
        )

    # Match both Markdown images and HTML <img> tags.
    pattern = r'(!\[.*?\]\((.*?)\))|(<img[^>]+src=["\'](.*?)["\'][^>]*>)'
    content = re.sub(pattern, img_replacer, content, flags=re.IGNORECASE)

    # 3. Table and text clean-up.
    content = re.sub(r' +', ' ', content)             # collapse runs of spaces
    content = re.sub(r'\{width=.*?\}', '', content)   # drop Pandoc size attributes
    content = re.sub(r'\n{3,}', '\n\n', content)      # collapse excess blank lines

    # 4. Save the Markdown.
    with open(output_md, 'w', encoding='utf-8') as f:
        f.write(content)

    # 5. Remove the temporary extraction folder.
    if os.path.exists(temp_media_folder):
        try:
            shutil.rmtree(temp_media_folder)
            print(f"🗑️ 已删除临时文件夹: {temp_media_folder}")
        except OSError as e:
            # Cleanup stays best-effort, but the failure is no longer silent.
            print(f"⚠️ 警告: 无法删除临时文件夹 {temp_media_folder}: {e}")

    print("\n" + "=" * 50)
    print("✅ 处理完成!")
    print(f"📂 请务必打开这个文件夹找图片: 【 {final_images_folder} 】")
    print(f"📄 Markdown 文件: {output_md}")
    print(f"🔢 共处理图片: {img_count} 张")
    print("=" * 50)
|
||||
|
||||
if __name__ == "__main__":
    # Local import: only the script entry point needs sys.
    import sys

    # Use the first command-line argument as the input .docx when one is
    # given; otherwise fall back to the original default file name.
    convert_final_clean(sys.argv[1] if len(sys.argv) > 1 else "1.docx")
|
||||
Reference in New Issue
Block a user