import subprocess
import os
import re
import json
from pathlib import Path


def convert_compact_mode(docx_file):
    """Convert a .docx file to compact GitHub-flavored Markdown via pandoc."""
    path_obj = Path(docx_file)
    base_name = path_obj.stem
    output_md = f"{base_name}_Compact.md"
    media_folder = f"{base_name}_images"
    metadata_file = f"{base_name}_metadata.json"
    images_list_file = f"{base_name}_images.txt"

    # 1. Key change: emit gfm so pandoc produces compact pipe tables
    cmd = [
        "pandoc", docx_file,
        "-o", output_md,
        "--to=gfm",  # <--- core change: GitHub-flavored output, no grid tables
        "--standalone",
        f"--extract-media={media_folder}",
        "--wrap=none",
        "--markdown-headings=atx",
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except FileNotFoundError:
        print("❌ Conversion failed: pandoc not found on PATH")
        return
    except subprocess.CalledProcessError as e:
        # stderr is bytes because capture_output=True was used without text=True
        print(f"❌ Conversion failed: {e.stderr.decode('utf-8', errors='replace')}")
        return

    with open(output_md, 'r', encoding='utf-8') as f:
        content = f.read()

    # 2. Compact-mode regex cleanup.
    # Since gfm emits pipe tables, we only need to match |...| rows and
    # squeeze the padding spaces pandoc adds for column alignment.
    # This step collapses "| text      |" down to "| text |".
    def compress_table_row(match):
        row = match.group(0)
        # Collapse runs of spaces into one space while keeping the row structure
        return re.sub(r' +', ' ', row)

    # Match standard Markdown table rows
    content = re.sub(r'^\|.*\|$', compress_table_row, content, flags=re.MULTILINE)

    # 3. Tag tables (logic simplified now that there is only one table format)
    tbl_count = 0

    def tag_table(match):
        nonlocal tbl_count
        tbl_count += 1
        return f"\n\n**[Table {tbl_count}]**\n{match.group(0)}\n"

    # Match a contiguous table block: header rows, separator row, body rows
    table_block_regex = r'(\|.*\|[\r\n]+)+(\|[\s\-:|]+\|[\r\n]+)(\|.*\|[\r\n]*)+'
    content = re.sub(table_block_regex, tag_table, content)

    # 4. Image tagging and manifest (unchanged)
    img_count = 0
    img_details = []

    def img_replacer(m):
        nonlocal img_count
        img_count += 1
        path = m.group(2) or m.group(3) or ""
        filename = os.path.basename(path)
        img_details.append(filename)
        return f"\n\n**[Image {img_count}: {filename}]**\n{m.group(0)}\n"

    content = re.sub(
        r'(!\[.*?\]\((.*?)\)|\[.*?\]:\s*([^\s]+\.(?:png|jpg|jpeg|gif|bmp))(\s*\{.*?\})?)',
        img_replacer, content)

    # 5. Deep noise cleanup
    content = re.sub(r'\[(.*?)\]\(\\l\)', r'\1', content)   # drop \l pseudo-links
    content = re.sub(r'\[(.*?)\]\[\d+\]', r'\1', content)   # unwrap numbered references
    content = re.sub(r'\{width=.*?\}', '', content)         # strip image size attributes
    content = re.sub(r'\n{3,}', '\n\n', content)            # collapse blank-line runs

    # 6. Save metadata and the image manifest
    with open(images_list_file, 'w', encoding='utf-8') as f:
        f.write(f"Image manifest for {base_name}:\n" + "=" * 30 + "\n")
        for i, name in enumerate(img_details, 1):
            f.write(f"{i}. {name}\n")

    stats = {
        "filename": base_name,
        "table_count": tbl_count,
        "image_count": img_count,
        "char_count": len(content),
        # Re-estimate tokens; with the padding removed this figure is more accurate
        "estimated_tokens": int(len(content) * 0.6),
    }
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=4, ensure_ascii=False)

    with open(output_md, 'w', encoding='utf-8') as f:
        f.write(content)

    print("\n" + "=" * 60)
    print("✅ Compact conversion done! Redundant padding removed")
    print(f"📁 Document: {output_md}")
    print(f"📊 Stats: {img_count} images, {tbl_count} tables")
    print("📉 Token savings: estimated ~30% smaller than the padded version")
    print("=" * 60)


if __name__ == "__main__":
    convert_compact_mode("1.docx")
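

# --- Batch helper: a minimal sketch, not part of the original script ---
# Runs the compact pipeline above on every .docx in a directory. The default
# directory "." and the "~$" lock-file filter are assumptions; to use it,
# call convert_all(...) from the __main__ guard instead of the single-file
# call, or move this definition above the guard.
def convert_all(directory="."):
    # Path.glob is non-recursive; switch to rglob to descend into subfolders
    for docx in sorted(Path(directory).glob("*.docx")):
        # Word leaves "~$..." lock files next to open documents; skip them
        if not docx.name.startswith("~$"):
            convert_compact_mode(str(docx))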