# Docx/pptx2md/pptx2md.py

import os
import re
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE

# --- Text cleaning helper ---
# Control characters that are never meaningful in extracted text. Tab (\x09),
# newline (\x0a) and carriage return (\x0d) are deliberately kept; vertical
# tab (\x0b) is handled separately in clean_text() because it is a line break.
_CONTROL_CHARS_RE = re.compile(r'[\x00-\x08\x0c\x0e-\x1f\x7f]')


def clean_text(text):
    """Return *text* with control characters removed and outer whitespace stripped.

    Args:
        text: The string to clean. ``None`` or an empty string is accepted.

    Returns:
        The cleaned string; ``""`` when *text* is falsy.
    """
    if not text:
        return ""

    # python-pptx reports a soft line break (<a:br>) inside a paragraph as a
    # vertical tab. Deleting it would glue the neighbouring words together,
    # so turn it into a real newline instead.
    text = text.replace("\x0b", "\n")

    # Drop the remaining control characters (binary residue such as NUL/BEL).
    # Only control characters are filtered here; no heuristic detection of
    # "garbage-looking" text is attempted.
    text = _CONTROL_CHARS_RE.sub('', text)

    return text.strip()

def _paragraph_to_markdown(paragraph):
    """Render the runs of one paragraph as a single Markdown line ("" if empty)."""
    pieces = []
    for run in paragraph.runs:
        raw = run.text or ""
        chunk = clean_text(raw)
        if not chunk:
            # A whitespace-only run still separates its neighbours.
            if raw and not raw.strip():
                pieces.append(" ")
            continue

        if run.font.bold:
            chunk = f"**{chunk}**"

        # clean_text() strips every run, which would fuse "Hello " + "world"
        # into "Helloworld". Put the boundary whitespace back, outside the
        # bold markers so the emphasis still renders.
        lead = " " if raw[0].isspace() else ""
        trail = " " if raw[-1].isspace() else ""
        pieces.append(f"{lead}{chunk}{trail}")

    return "".join(pieces).strip()


def _table_to_markdown(table):
    """Render a table as a Markdown table whose header spans every column."""
    rows = [
        [
            # Escape "|" so cell text cannot split the row; Markdown table
            # cells cannot hold raw newlines, hence <br>.
            clean_text(cell.text_frame.text).replace("|", "\\|").replace("\n", "<br>")
            for cell in row.cells
        ]
        for row in table.rows
    ]
    width = max((len(cells) for cells in rows), default=1)

    # The header and delimiter rows must have as many cells as the data rows:
    # renderers following the GFM table rules ignore cells beyond the header
    # width, so a one-cell header would hide every column but the first.
    header = ["PPT 表格数据"] + [""] * (width - 1)
    lines = [
        "| " + " | ".join(header) + " |",
        "| " + " | ".join(["---"] * width) + " |",
    ]
    lines.extend("| " + " | ".join(cells) + " |" for cells in rows)
    return "\n" + "\n".join(lines) + "\n\n"


def extract_pptx_fixed(pptx_path, output_md, media_folder):
    """Export the text, tables and pictures of a .pptx file to Markdown.

    For every slide a heading is written, followed by the speaker notes (if
    any) and then the slide's shapes ordered by (top, left): text frames
    become paragraphs (bold runs wrapped in ``**``, title paragraphs prefixed
    with ``### ``), tables become Markdown tables, and pictures are saved to
    *media_folder* and linked from the Markdown.

    Args:
        pptx_path: Path of the presentation to read.
        output_md: Path of the Markdown file to write (overwritten).
        media_folder: Directory that receives extracted images; created if
            it does not exist.

    Returns:
        None. If *pptx_path* does not exist a message is printed and nothing
        is created or written.
    """
    if not os.path.exists(pptx_path):
        print(f"找不到文件: {pptx_path}")
        return

    os.makedirs(media_folder, exist_ok=True)

    prs = Presentation(pptx_path)

    with open(output_md, "w", encoding="utf-8") as f:
        f.write(f"# PPT 提取报告: {os.path.basename(pptx_path)}\n\n")

        for page, slide in enumerate(prs.slides, start=1):
            f.write(f"## --- 第 {page} 页 ---\n\n")

            # 1. Speaker notes (cleaned like all other text).
            if slide.has_notes_slide:
                # notes_text_frame is None when the notes slide has no notes
                # placeholder.
                notes_frame = slide.notes_slide.notes_text_frame
                cleaned_notes = clean_text(notes_frame.text) if notes_frame is not None else ""
                if cleaned_notes:
                    f.write(f"> **演讲者备注**: {cleaned_notes}\n\n")

            # 2. Visual order, top to bottom then left to right. A shape
            #    without an explicit position reports None, which cannot be
            #    compared with ints, so treat it as 0.
            shapes = sorted(slide.shapes, key=lambda s: (s.top or 0, s.left or 0))
            title_shape = slide.shapes.title

            for shape in shapes:
                # --- A. Text ---
                if shape.has_text_frame:
                    try:
                        is_title = title_shape is not None and shape == title_shape
                        prefix = "### " if is_title else ""
                        for paragraph in shape.text_frame.paragraphs:
                            para_text = _paragraph_to_markdown(paragraph)
                            if para_text:
                                f.write(f"{prefix}{para_text}\n")
                        f.write("\n")
                    except Exception as e:
                        # Best effort: a broken text box must not stop the export.
                        print(f"跳过第 {page} 页的损坏文本框: {e}")

                # --- B. Tables ---
                if shape.has_table:
                    f.write(_table_to_markdown(shape.table))

                # --- C. Pictures ---
                try:
                    is_picture = shape.shape_type == MSO_SHAPE_TYPE.PICTURE
                except NotImplementedError:
                    # python-pptx raises this for shapes it cannot classify;
                    # such a shape is simply not a picture.
                    is_picture = False

                if is_picture:
                    try:
                        image = shape.image
                        ext = image.ext
                        image_name = f"slide_{page}_img_{shape.shape_id}.{ext}"
                        image_path = os.path.join(media_folder, image_name)

                        with open(image_path, "wb") as img_f:
                            img_f.write(image.blob)

                        f.write(f"![幻灯片图片]({media_folder}/{image_name})\n\n")
                    except Exception as e:
                        # Best effort: skip pictures whose image data cannot be read.
                        print(f"无法保存图片 (可能是损坏的OLE对象): {e}")

    print(f"✅ 修复版提取完成！乱码已被过滤。保存至: {output_md}")

# --- Script entry point ---
if __name__ == "__main__":
    # Replace these with your own file names.
    extract_pptx_fixed(
        pptx_path="1.pptx",
        output_md="output_ppt_fixed.md",
        media_folder="ppt_media_fixed",
    )