mirror of
https://github.com/ChuXunYu/OfficeFileHandle.git
synced 2026-01-31 10:11:25 +00:00
103 lines
3.6 KiB
Python
103 lines
3.6 KiB
Python
import subprocess
|
||
import os
|
||
import re
|
||
import json
|
||
from pathlib import Path
|
||
|
||
def convert_compact_mode(docx_file):
    """Convert *docx_file* to compact GitHub-Flavored Markdown via pandoc.

    Side effects (all paths derived from the input file's stem):
      - <stem>_Compact.md      cleaned Markdown output
      - <stem>_images/         media extracted by pandoc
      - <stem>_images.txt      numbered inventory of extracted image files
      - <stem>_metadata.json   table/image counts, char count, token estimate

    Returns None. On pandoc failure it prints an error and returns early.
    """
    base_name = Path(docx_file).stem
    output_md = f"{base_name}_Compact.md"
    media_folder = f"{base_name}_images"
    metadata_file = f"{base_name}_metadata.json"
    images_list_file = f"{base_name}_images.txt"

    # 1. Key choice: gfm output forces compact pipe tables (no grid tables).
    cmd = [
        "pandoc", docx_file, "-o", output_md,
        "--to=gfm",
        "--standalone",
        f"--extract-media={media_folder}",
        "--wrap=none",
        "--markdown-headings=atx",
    ]
    try:
        # check=True raises CalledProcessError on non-zero pandoc exit.
        subprocess.run(cmd, check=True, capture_output=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # Narrowed from bare `Exception`: covers pandoc failure and
        # "pandoc not installed" (FileNotFoundError is an OSError).
        print(f"❌ 转换出错: {e}")
        return

    with open(output_md, 'r', encoding='utf-8') as f:
        content = f.read()

    # 2-3. Compress pipe-table whitespace, then tag each table block.
    content, tbl_count = _compact_tables(content)
    # 4. Tag each image reference and collect extracted filenames.
    content, img_count, img_details = _tag_images(content)
    # 5. Strip pandoc conversion noise.
    content = _strip_noise(content)

    # 6. Persist the image inventory and conversion metadata.
    with open(images_list_file, 'w', encoding='utf-8') as f:
        f.write(f"文档 {base_name} 图片清单:\n" + "="*30 + "\n")
        for i, name in enumerate(img_details, 1):
            f.write(f"{i}. {name}\n")

    stats = {
        "filename": base_name,
        "table_count": tbl_count,
        "image_count": img_count,
        "char_count": len(content),
        # Rough token estimate; more accurate now that padding spaces are gone.
        "estimated_tokens": int(len(content) * 0.6)
    }
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=4, ensure_ascii=False)

    with open(output_md, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f"\n" + "="*60)
    print(f"✅ 紧凑版转换完成!已去除冗余空格")
    print(f"📁 文档: {output_md}")
    print(f"📊 统计: {img_count} 图片, {tbl_count} 表格")
    print(f"📉 Token 节省: 预估比原版节省 30% 空间")
    print("="*60)


def _compact_tables(content):
    """Collapse alignment spaces in pipe-table rows and tag each table.

    Because gfm output only produces pipe tables, matching `|...|` lines is
    sufficient. Returns ``(content, table_count)``.
    """
    def compress_table_row(match):
        # Collapse runs of spaces used for column alignment; keep structure.
        return re.sub(r' +', ' ', match.group(0))

    content = re.sub(r'^\|.*\|$', compress_table_row, content,
                     flags=re.MULTILINE)

    tbl_count = 0

    def tag_table(match):
        nonlocal tbl_count
        tbl_count += 1
        return f"\n\n**[表格 {tbl_count}]**\n{match.group(0)}\n"

    # A table block: header row(s), a |---|-style separator row, body row(s).
    table_block_regex = r'(\|.*\|[\r\n]+)+(\|[\s\-:|]+\|[\r\n]+)(\|.*\|[\r\n]*)+'
    content = re.sub(table_block_regex, tag_table, content)
    return content, tbl_count


def _tag_images(content):
    """Number every image reference and collect extracted filenames.

    Returns ``(content, image_count, filenames)``.

    Fix vs. original: the inserted tag now shows the image's actual filename;
    the original computed ``filename`` but hard-coded the literal "(unknown)".
    """
    img_count = 0
    img_details = []

    def img_replacer(m):
        nonlocal img_count
        img_count += 1
        # group(2): inline ![alt](path); group(3): reference-style [x]: path.
        path = m.group(2) or m.group(3) or ""
        filename = os.path.basename(path)
        img_details.append(filename)
        return f"\n\n**[图片 {img_count}: {filename}]**\n{m.group(0)}\n"

    content = re.sub(
        r'(!\[.*?\]\((.*?)\)|\[.*?\]:\s*([^\s]+\.(?:png|jpg|jpeg|gif|bmp))(\s*\{.*?\})?)',
        img_replacer, content)
    return content, img_count, img_details


def _strip_noise(content):
    """Remove pandoc artifacts: ``\\l`` links, numeric reference labels,
    ``{width=...}`` size attributes, and runs of 3+ blank lines."""
    content = re.sub(r'\[(.*?)\]\(\\l\)', r'\1', content)
    content = re.sub(r'\[(.*?)\]\[\d+\]', r'\1', content)
    content = re.sub(r'\{width=.*?\}', '', content)
    content = re.sub(r'\n{3,}', '\n\n', content)
    return content
|
||
|
||
if __name__ == "__main__":
    # Accept an optional document path on the command line; with no argument
    # the original hard-coded default "1.docx" is preserved.
    import sys
    convert_compact_mode(sys.argv[1] if len(sys.argv) > 1 else "1.docx")