"""Office-document converters: DOCX -> Markdown, PPTX -> Markdown, XLSX -> CSV.

Reconstructed from a git format-patch whose line breaks had been collapsed.

    Commit:  95028f8070647a980c6c676235f91783d4c5544f
    From:    ChuXun <70203584+ChuXunYu@users.noreply.github.com>
    Date:    Wed, 24 Dec 2025 21:38:40 +0800
    Subject: [PATCH] Initial commit

The patch created four files, whose code is merged into this module:

    .gitattributes        "* text=auto" (auto-detect text files, LF normalization)
    docx2md/docx2md.py    convert_compact_mode(), compact_markdown()
    pptx2md/pptx2md.py    clean_text(), extract_pptx_fixed()
    xlsx2csv/xlsx2csv.py  xlsx_to_csv()

Usage:
    python <this file> FILE [FILE ...]   # dispatches on .docx / .pptx / .xlsx
    python <this file>                   # runs the original sample inputs
"""

import json
import os
import re
import subprocess
import sys
from itertools import groupby
from pathlib import Path

# python-pptx and pandas are imported inside the functions that use them, so
# the module can be imported (and the DOCX path used) without them installed.


# ---------------------------------------------------------------------------
# docx2md
# ---------------------------------------------------------------------------

# One pipe-table row: a line that starts and ends with "|".
_TABLE_ROW_RE = re.compile(r'^\|.*\|$', re.MULTILINE)

# A whole pipe table: header row, delimiter row, then any number of body rows
# on *consecutive* lines.  Anchoring every row to a line start and allowing
# exactly one newline between rows keeps two tables that are separated by a
# blank line from being merged into a single match.
_TABLE_BLOCK_RE = re.compile(
    r'^\|.*\|[ \t]*\n'           # header row
    r'\|[ \t:|-]+\|[ \t]*$'      # delimiter row, e.g. |---|:--:|
    r'(?:\n\|.*\|[ \t]*$)*',     # body rows
    re.MULTILINE,
)

# Image references in three spellings:
#   inline   ![alt](path)
#   ref      [label]: path.png {attrs}
#   html     <img src="path" ...>  (the form pandoc's gfm writer is expected
#            to use for images that carry width/height attributes)
_IMAGE_RE = re.compile(
    r'!\[.*?\]\((?P<inline>.*?)\)'
    r'|\[.*?\]:\s*(?P<ref>[^\s]+\.(?:png|jpg|jpeg|gif|bmp))(?:\s*\{.*?\})?'
    r'|<img\b[^>]*?\bsrc="(?P<html>[^"]*)"[^>]*>'
)


def compact_markdown(content):
    """Compact and annotate pandoc-generated GFM Markdown.

    Steps, in order:
      1. Collapse runs of spaces inside pipe-table rows (alignment padding).
      2. Prefix every table with a numbered ``**[表格 N]**`` label.
      3. Prefix every image with a numbered ``**[图片 N: filename]**`` label.
      4. Strip leftover pandoc noise (``(\\l)`` anchors, ``[text][n]``
         references, ``{width=...}`` attributes) and collapse 3+ newlines.

    Args:
        content: Markdown text as produced by ``pandoc --to=gfm``.

    Returns:
        A tuple ``(text, table_count, image_names)`` where ``image_names``
        lists the base file name of each image in document order.
    """
    # 1. Squeeze padding: "|  text   |" -> "| text |".
    content = _TABLE_ROW_RE.sub(
        lambda m: re.sub(r' +', ' ', m.group(0)), content
    )

    # 2. Label tables.
    table_count = 0

    def tag_table(match):
        nonlocal table_count
        table_count += 1
        return f"\n\n**[表格 {table_count}]**\n{match.group(0)}\n"

    content = _TABLE_BLOCK_RE.sub(tag_table, content)

    # 3. Label images and remember their file names.
    image_names = []

    def tag_image(match):
        path = (match.group("inline") or match.group("ref")
                or match.group("html") or "")
        filename = os.path.basename(path)
        image_names.append(filename)
        return f"\n\n**[图片 {len(image_names)}: {filename}]**\n{match.group(0)}\n"

    content = _IMAGE_RE.sub(tag_image, content)

    # 4. Remove noise left behind by the conversion.
    content = re.sub(r'\[(.*?)\]\(\\l\)', r'\1', content)   # [text](\l) -> text
    content = re.sub(r'\[(.*?)\]\[\d+\]', r'\1', content)   # [text][12] -> text
    content = re.sub(r'\{width=.*?\}', '', content)
    content = re.sub(r'\n{3,}', '\n\n', content)

    return content, table_count, image_names


def convert_compact_mode(docx_file):
    """Convert a .docx file to compact, annotated GFM Markdown via pandoc.

    Writes four artefacts into the current working directory, all named after
    the input's stem: ``<stem>_Compact.md``, the extracted-media folder
    ``<stem>_images``, ``<stem>_metadata.json`` and ``<stem>_images.txt``.

    Args:
        docx_file: Path of the .docx document to convert.

    Returns:
        None.  Failures (missing input, pandoc missing or failing) are
        reported on stdout and the function returns early.
    """
    if not Path(docx_file).is_file():
        print(f"找不到文件: {docx_file}")
        return

    base_name = Path(docx_file).stem
    output_md = f"{base_name}_Compact.md"
    media_folder = f"{base_name}_images"
    metadata_file = f"{base_name}_metadata.json"
    images_list_file = f"{base_name}_images.txt"

    # gfm output yields pipe tables instead of pandoc's wide grid tables.
    cmd = [
        "pandoc", str(docx_file), "-o", output_md,
        "--to=gfm",
        "--standalone",
        f"--extract-media={media_folder}",
        "--wrap=none",
        "--markdown-headings=atx",
    ]

    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ 转换出错: {e}")
        # stderr is captured, so surface pandoc's own diagnostic explicitly.
        detail = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        if detail:
            print(detail)
        return
    except OSError as e:  # e.g. pandoc is not installed / not executable
        print(f"❌ 转换出错: {e}")
        return

    with open(output_md, 'r', encoding='utf-8') as f:
        content = f.read()

    content, tbl_count, img_details = compact_markdown(content)
    img_count = len(img_details)

    with open(images_list_file, 'w', encoding='utf-8') as f:
        f.write(f"文档 {base_name} 图片清单:\n" + "=" * 30 + "\n")
        f.writelines(
            f"{i}. {name}\n" for i, name in enumerate(img_details, 1)
        )

    stats = {
        "filename": base_name,
        "table_count": tbl_count,
        "image_count": img_count,
        "char_count": len(content),
        # Rough heuristic (0.6 tokens per character), not a real tokenizer.
        "estimated_tokens": int(len(content) * 0.6),
    }
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=4, ensure_ascii=False)

    with open(output_md, 'w', encoding='utf-8') as f:
        f.write(content)

    print("\n" + "=" * 60)
    print("✅ 紧凑版转换完成!已去除冗余空格")
    print(f"📁 文档: {output_md}")
    print(f"📊 统计: {img_count} 图片, {tbl_count} 表格")
    print("📉 Token 节省: 预估比原版节省 30% 空间")
    print("=" * 60)


# ---------------------------------------------------------------------------
# pptx2md
# ---------------------------------------------------------------------------

# Control characters other than \t, \n and \r, plus DEL.
_CONTROL_CHARS_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]')


def clean_text(text):
    """Remove control characters from *text* and strip surrounding whitespace.

    Tab, newline and carriage return are kept; every other character in
    ``\\x00-\\x1f`` and ``\\x7f`` is deleted.  ``None`` or an empty string
    yields ``""``.
    """
    if not text:
        return ""
    return _CONTROL_CHARS_RE.sub('', text).strip()


def _paragraph_markdown(paragraph):
    """Render one python-pptx paragraph as a line of Markdown.

    Consecutive runs with the same bold state are merged first, so adjacent
    bold runs do not produce ``**a****b**``.  Whitespace at run boundaries is
    kept (stripping each run separately would glue neighbouring words
    together) and is moved outside the ``**`` markers, because emphasis
    markers adjacent to whitespace are not recognised by Markdown.
    """
    pieces = []
    for is_bold, runs in groupby(paragraph.runs,
                                 key=lambda run: bool(run.font.bold)):
        text = _CONTROL_CHARS_RE.sub('', ''.join(run.text or '' for run in runs))
        core = text.strip()
        if not core:
            # Whitespace-only span: keep a single separating space.
            pieces.append(' ' if text else '')
        elif is_bold:
            lead = text[:len(text) - len(text.lstrip())]
            trail = text[len(text.rstrip()):]
            pieces.append(f"{lead}**{core}**{trail}")
        else:
            pieces.append(text)
    return ''.join(pieces).strip()


def _table_markdown(table):
    """Render a python-pptx table as a GFM pipe table.

    The header keeps the original ``PPT 表格数据`` caption in its first cell
    and is padded with empty cells to the table's width: GFM renderers discard
    body cells beyond the header's column count, so a one-cell header would
    hide every column but the first.
    """
    rows = []
    for row in table.rows:
        cells = []
        for cell in row.cells:
            # "\v" is how python-pptx reports a soft line break; turn it into
            # a newline before clean_text() would delete it.
            text = clean_text(cell.text_frame.text.replace('\v', '\n'))
            # NOTE(review): the replacement literal was lost in the collapsed
            # source ("'" + line break + "'"); "<br>" is the assumed original.
            text = text.replace('\n', '<br>').replace('|', '\\|')
            cells.append(text)
        rows.append(cells)

    width = max((len(cells) for cells in rows), default=1)
    header = ["PPT 表格数据"] + [""] * (width - 1)
    lines = [
        "| " + " | ".join(header) + " |",
        "| " + " | ".join(["---"] * width) + " |",
    ]
    lines.extend("| " + " | ".join(cells) + " |" for cells in rows)
    return "\n" + "\n".join(lines) + "\n\n"


def extract_pptx_fixed(pptx_path, output_md, media_folder):
    """Extract notes, text, tables and pictures of a .pptx into Markdown.

    Args:
        pptx_path: Presentation to read.
        output_md: Markdown file to write (overwritten).
        media_folder: Directory that receives the extracted pictures; created
            if missing.  Picture links in the Markdown use this path verbatim.

    Returns:
        None.  A missing input file is reported on stdout.
    """
    if not os.path.exists(pptx_path):
        print(f"找不到文件: {pptx_path}")
        return

    from pptx import Presentation
    from pptx.enum.shapes import MSO_SHAPE_TYPE

    os.makedirs(media_folder, exist_ok=True)
    prs = Presentation(pptx_path)

    with open(output_md, "w", encoding="utf-8") as f:
        f.write(f"# PPT 提取报告: {os.path.basename(pptx_path)}\n\n")

        for page, slide in enumerate(prs.slides, 1):
            f.write(f"## --- 第 {page} 页 ---\n\n")

            # 1. Speaker notes.  The notes slide may lack a notes placeholder,
            #    in which case notes_text_frame is None.
            if slide.has_notes_slide:
                notes_frame = slide.notes_slide.notes_text_frame
                if notes_frame is not None:
                    cleaned_notes = clean_text(
                        notes_frame.text.replace('\v', '\n')
                    )
                    if cleaned_notes:
                        f.write(f"> **演讲者备注**: {cleaned_notes}\n\n")

            # 2. Visual order: top-to-bottom, then left-to-right.  A shape
            #    without an explicit position reports None, which cannot be
            #    compared with ints, so treat it as 0.
            shapes = sorted(
                slide.shapes, key=lambda s: (s.top or 0, s.left or 0)
            )
            title_shape = slide.shapes.title  # None when there is no title

            for shape in shapes:
                # --- A. Text ---
                if shape.has_text_frame:
                    try:
                        prefix = "### " if shape == title_shape else ""
                        for paragraph in shape.text_frame.paragraphs:
                            para_text = _paragraph_markdown(paragraph)
                            if para_text:
                                f.write(f"{prefix}{para_text}\n")
                        f.write("\n")
                    except Exception as e:
                        print(f"跳过第 {page} 页的损坏文本框: {e}")

                # --- B. Tables ---
                if shape.has_table:
                    f.write(_table_markdown(shape.table))

                # --- C. Pictures ---
                # NOTE(review): python-pptx's base shape class appears to
                # raise NotImplementedError from shape_type for shape kinds it
                # does not model; treat those as "not a picture".
                try:
                    is_picture = shape.shape_type == MSO_SHAPE_TYPE.PICTURE
                except NotImplementedError:
                    is_picture = False

                if is_picture:
                    try:
                        image = shape.image
                        image_name = (
                            f"slide_{page}_img_{shape.shape_id}.{image.ext}"
                        )
                        image_path = os.path.join(media_folder, image_name)
                        with open(image_path, "wb") as img_f:
                            img_f.write(image.blob)
                        f.write(f"![幻灯片图片]({media_folder}/{image_name})\n\n")
                    except Exception as e:
                        print(f"无法保存图片 (可能是损坏的OLE对象): {e}")

    print(f"✅ 修复版提取完成!乱码已被过滤。保存至: {output_md}")


# ---------------------------------------------------------------------------
# xlsx2csv
# ---------------------------------------------------------------------------

def xlsx_to_csv(xlsx_path="1.xlsx"):
    """Return the first worksheet of *xlsx_path* as CSV text (no index column).

    Raises whatever ``pandas.read_excel`` raises, e.g. ``FileNotFoundError``
    for a missing workbook.
    """
    import pandas as pd

    return pd.read_excel(xlsx_path).to_csv(index=False)


# ---------------------------------------------------------------------------
# Command line
# ---------------------------------------------------------------------------

def main(argv=None):
    """Dispatch each path in *argv* to the matching converter.

    With no arguments, the sample inputs hard-coded in the original scripts
    are processed ("a2.docx", "1.pptx", "1.xlsx").

    Returns:
        Process exit status: 0 on success, 1 if a path had an unsupported
        extension.
    """
    paths = list(sys.argv[1:] if argv is None else argv)

    if not paths:
        convert_compact_mode("a2.docx")
        extract_pptx_fixed("1.pptx", "output_ppt_fixed.md", "ppt_media_fixed")
        if os.path.exists("1.xlsx"):
            # to_csv() output already ends with a newline.
            print(xlsx_to_csv("1.xlsx"), end="")
        else:
            print("找不到文件: 1.xlsx")
        return 0

    status = 0
    for path in paths:
        stem = Path(path).stem
        suffix = Path(path).suffix.lower()
        if suffix == ".docx":
            convert_compact_mode(path)
        elif suffix == ".pptx":
            extract_pptx_fixed(
                path, f"{stem}_ppt_fixed.md", f"{stem}_ppt_media_fixed"
            )
        elif suffix == ".xlsx":
            print(xlsx_to_csv(path), end="")
        else:
            print(f"Unsupported file type: {path}", file=sys.stderr)
            status = 1
    return status


if __name__ == "__main__":
    sys.exit(main())