mirror of
https://github.com/ChuXunYu/OfficeFileHandle.git
synced 2026-01-31 00:51:26 +00:00
120 lines
4.3 KiB
Python
120 lines
4.3 KiB
Python
import os
import re
import shutil
import subprocess
import sys
import urllib.parse  # decodes %20-style escapes in image file names
from pathlib import Path


# Matches a Markdown link destination followed by a quoted title, e.g.
# `media/image1.png "caption"`; group(1) is the bare destination.
_LINK_TITLE_RE = re.compile(r"""^(.*?)\s+(?:"[^"]*"|'[^']*')\s*$""", re.DOTALL)


def _strip_link_title(target):
    """Return a Markdown link target without its optional quoted title."""
    match = _LINK_TITLE_RE.match(target)
    return match.group(1) if match else target


def convert_final_clean(docx_file):
    """Convert a .docx file to compact Markdown with sequentially named images.

    Pandoc converts ``docx_file`` to GitHub-flavoured Markdown, written to
    ``<stem>_Compact.md`` (relative to the current working directory), and
    extracts embedded media into a temporary ``<stem>_temp_media`` folder.
    Each image reference in the Markdown (``![]()`` or HTML ``<img src>``)
    whose file exists on disk is copied into ``<stem>_Images_Ordered`` as
    ``Image_001.<ext>``, ``Image_002.<ext>``, ... in order of appearance and
    is replaced in the text by an upload-instruction placeholder. References
    whose file cannot be found are left unchanged and a warning is printed.
    Afterwards runs of spaces are collapsed, Pandoc ``{width=...}`` markers
    are removed, runs of three or more newlines are reduced to two, the
    Markdown file is rewritten and the temporary media folder is deleted.

    Args:
        docx_file: Path of the source .docx document.

    Returns:
        None. Progress is reported with ``print``. If Pandoc cannot be
        started or exits with an error, a message (plus Pandoc's stderr when
        available) is printed and the function returns before creating the
        image folder.
    """
    # --- Configuration ---
    base_name = Path(docx_file).stem
    output_md = f"{base_name}_Compact.md"
    # Temporary folder (unordered media; removed when the script finishes).
    temp_media_folder = f"{base_name}_temp_media"
    # Final folder (images renamed in document order).
    final_images_folder = f"{base_name}_Images_Ordered"

    print("🔄 正在启动 Pandoc 转换...")

    # 1. Pandoc conversion (extracts the original images).
    cmd = [
        "pandoc", docx_file, "-o", output_md,
        "--to=gfm",
        "--standalone",
        f"--extract-media={temp_media_folder}",  # extract into the temp folder
        "--wrap=none",
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f"❌ Pandoc 转换失败: {e}")
        # Output is captured, so surface Pandoc's own diagnostics explicitly.
        stderr = getattr(e, "stderr", None)
        if stderr:
            print(stderr.decode("utf-8", errors="replace").strip())
        return

    img_count = 0

    def img_replacer(m):
        """Copy the referenced image under its ordered name; return the placeholder."""
        nonlocal img_count

        # group(2) is the Markdown target, group(4) is the HTML src value.
        if m.group(2):
            raw_path = _strip_link_title(m.group(2))
        else:
            raw_path = m.group(4) or ""
        if not raw_path:
            return m.group(0)

        # Decode URL escapes ("media/image%201.png" -> "media/image 1.png")
        # and normalise Windows backslashes.
        original_path = urllib.parse.unquote(raw_path).replace("\\", "/")

        # Build the new name (Image_001.png); default to .png without a suffix.
        ext = os.path.splitext(original_path)[1] or ".png"
        new_filename = f"Image_{img_count + 1:03d}{ext}"

        # Paths in the Markdown are relative to the working directory.
        src_full_path = os.path.join(os.getcwd(), original_path)
        dest_full_path = os.path.join(final_images_folder, new_filename)

        if not os.path.exists(src_full_path):
            # Source image missing: keep the reference as-is and warn.
            print(f"⚠️ 警告: 找不到图片文件 {src_full_path}")
            return m.group(0)

        shutil.copy2(src_full_path, dest_full_path)
        img_count += 1  # only count images that were actually copied

        # Upload instruction (Markdown) that replaces the image reference.
        return (
            f"\n\n---\n"
            f"> 📸 **[指令:请在此处上传图片 {img_count:03d}]**\n"
            f"> 文件名: `{new_filename}`\n"
            f"---\n\n"
        )

    try:
        # Prepare the final folder, clearing any previous run's output.
        if os.path.exists(final_images_folder):
            shutil.rmtree(final_images_folder)
        os.makedirs(final_images_folder)

        with open(output_md, 'r', encoding='utf-8') as f:
            content = f.read()

        print("🔄 正在整理图片并重命名...")

        # 2. Image collection and renaming: match Markdown and HTML images.
        pattern = r'(!\[.*?\]\((.*?)\))|(<img[^>]+src=["\'](.*?)["\'][^>]*>)'
        content = re.sub(pattern, img_replacer, content, flags=re.IGNORECASE)

        # 3. Table and text clean-up.
        content = re.sub(r' +', ' ', content)  # collapse runs of spaces
        content = re.sub(r'\{width=.*?\}', '', content)  # drop pandoc size marks
        content = re.sub(r'\n{3,}', '\n\n', content)  # collapse extra blank lines

        # 4. Save the Markdown.
        with open(output_md, 'w', encoding='utf-8') as f:
            f.write(content)
    finally:
        # 5. Always remove the temporary media folder, even if a step failed.
        if os.path.exists(temp_media_folder):
            try:
                shutil.rmtree(temp_media_folder)
            except OSError as e:
                # Best effort: report instead of silently swallowing.
                print(f"⚠️ 警告: 无法删除临时文件夹 {temp_media_folder}: {e}")
            else:
                print(f"🗑️ 已删除临时文件夹: {temp_media_folder}")

    print("\n" + "=" * 50)
    print("✅ 处理完成!")
    print(f"📂 请务必打开这个文件夹找图片: 【 {final_images_folder} 】")
    print(f"📄 Markdown 文件: {output_md}")
    print(f"🔢 共处理图片: {img_count} 张")
    print("=" * 50)


if __name__ == "__main__":
    # Convert the .docx given as the first command-line argument; fall back
    # to the historical default "1.docx" when no argument is supplied.
    convert_final_clean(sys.argv[1] if len(sys.argv) > 1 else "1.docx")