Files
Docx/docx2md/docx2md.py
ChuXun 8d4419b1a0 1
2025-12-29 03:11:16 +08:00

103 lines
3.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import subprocess
import os
import re
import json
from pathlib import Path
def _clean_markdown(content):
    """Compact and annotate pandoc-generated GFM markdown.

    Runs the pure text pipeline: squashes padding spaces in pipe-table rows,
    prefixes every table block and every image reference with a numbered
    marker, strips leftover link/attribute noise and collapses blank-line runs.

    Args:
        content: Markdown text as produced by pandoc.

    Returns:
        A ``(cleaned_text, table_count, image_filenames)`` tuple, where
        ``image_filenames`` lists the basename of each image in order of
        appearance.
    """
    # Pipe tables are padded with spaces for column alignment; collapse every
    # run of spaces inside a table row ("|  text   |" -> "| text |").
    def compress_table_row(match):
        return re.sub(r' +', ' ', match.group(0))

    content = re.sub(r'^\|.*\|$', compress_table_row, content, flags=re.MULTILINE)

    # Tag each table block: header row(s), a separator row, then body rows.
    tbl_count = 0

    def tag_table(match):
        nonlocal tbl_count
        tbl_count += 1
        return f"\n\n**[表格 {tbl_count}]**\n{match.group(0)}\n"

    table_block_regex = r'(\|.*\|[\r\n]+)+(\|[\s\-:|]+\|[\r\n]+)(\|.*\|[\r\n]*)+'
    content = re.sub(table_block_regex, tag_table, content)

    # Tag each image. Group 2 is the target of an inline image `![alt](path)`;
    # group 3 is the target of a reference-style definition `[id]: path.ext`.
    # NOTE(review): the marker is wrapped in newlines, so an image that sits
    # inside a table cell splits that table row — confirm whether that matters.
    # NOTE(review): pandoc may emit sized images as raw HTML <img> tags under
    # gfm, which this pattern would not count — verify against real output.
    img_details = []

    def img_replacer(m):
        path = m.group(2) or m.group(3) or ""
        filename = os.path.basename(path)
        img_details.append(filename)
        # The list length doubles as the running 1-based image number.
        return f"\n\n**[图片 {len(img_details)}: {filename}]**\n{m.group(0)}\n"

    content = re.sub(
        r'(!\[.*?\]\((.*?)\)|\[.*?\]:\s*([^\s]+\.(?:png|jpg|jpeg|gif|bmp))(\s*\{.*?\})?)',
        img_replacer,
        content,
    )

    # Noise removal: `[text](\l)` links and `[text][n]` numbered references are
    # reduced to their text, `{width=...}` attribute blocks are dropped, and
    # three or more consecutive newlines become exactly two.
    content = re.sub(r'\[(.*?)\]\(\\l\)', r'\1', content)
    content = re.sub(r'\[(.*?)\]\[\d+\]', r'\1', content)
    content = re.sub(r'\{width=.*?\}', '', content)
    content = re.sub(r'\n{3,}', '\n\n', content)

    return content, tbl_count, img_details


def convert_compact_mode(docx_file: str) -> None:
    """Convert a .docx file to compact GitHub-flavoured markdown via pandoc.

    All outputs are written to the current working directory and named after
    the stem of ``docx_file``:

    * ``<stem>_Compact.md``    - the cleaned markdown document
    * ``<stem>_images/``       - media extracted by pandoc
    * ``<stem>_images.txt``    - numbered list of image filenames
    * ``<stem>_metadata.json`` - table/image/character counts and a token estimate

    Args:
        docx_file: Path of the input document, passed to pandoc unchanged.

    Returns:
        None. If pandoc cannot be started or exits with an error, a message is
        printed and the function returns before the list/metadata files are
        written.
    """
    base_name = Path(docx_file).stem
    output_md = f"{base_name}_Compact.md"
    media_folder = f"{base_name}_images"
    metadata_file = f"{base_name}_metadata.json"
    images_list_file = f"{base_name}_images.txt"

    # 1. Run pandoc. `--to=gfm` is the key choice: it yields compact pipe
    #    tables instead of grid tables.
    cmd = [
        "pandoc", docx_file, "-o", output_md,
        "--to=gfm",
        "--standalone",
        f"--extract-media={media_folder}",
        "--wrap=none",
        "--markdown-headings=atx",
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ 转换出错: {e}")
        # capture_output=True hides pandoc's own diagnostics; surface them,
        # otherwise only the exit status is visible.
        stderr_text = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        if stderr_text:
            print(f"   pandoc stderr: {stderr_text}")
        return
    except (OSError, ValueError) as e:
        # OSError: pandoc is missing or not executable; ValueError: invalid
        # argument (e.g. an embedded NUL in the path).
        print(f"❌ 转换出错: {e}")
        return

    with open(output_md, 'r', encoding='utf-8') as f:
        content = f.read()

    # 2-5. Compact the tables, tag tables and images, strip noise.
    content, tbl_count, img_details = _clean_markdown(content)
    img_count = len(img_details)

    # 6. Persist the image list, the metadata, and finally the cleaned markdown.
    with open(images_list_file, 'w', encoding='utf-8') as f:
        f.write(f"文档 {base_name} 图片清单:\n" + "=" * 30 + "\n")
        for i, name in enumerate(img_details, 1):
            f.write(f"{i}. {name}\n")

    stats = {
        "filename": base_name,
        "table_count": tbl_count,
        "image_count": img_count,
        "char_count": len(content),
        # Rough token estimate (0.6 per character), computed on the cleaned text.
        "estimated_tokens": int(len(content) * 0.6),
    }
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=4, ensure_ascii=False)

    with open(output_md, 'w', encoding='utf-8') as f:
        f.write(content)

    print("\n" + "=" * 60)
    print("✅ 紧凑版转换完成!已去除冗余空格")
    print(f"📁 文档: {output_md}")
    print(f"📊 统计: {img_count} 图片, {tbl_count} 表格")
    print("📉 Token 节省: 预估比原版节省 30% 空间")
    print("=" * 60)
if __name__ == "__main__":
    # Script entry point: convert the hard-coded sample document, whose
    # relative path is handed to pandoc as-is.
    sample_docx = "详细设计说明书(最终版).docx"
    convert_compact_mode(sample_docx)