import os
import re

from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE

# Control characters removed from extracted text. Tab (\t), newline (\n) and
# carriage return (\r) are deliberately kept.
_CONTROL_CHARS_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]')


def _strip_control_chars(text):
    """Remove control characters from *text* without trimming whitespace."""
    # python-pptx renders a soft line break (<a:br>) as a vertical tab in
    # ``.text``; convert it to a newline first so the words on either side
    # of the break are not glued together when control characters are dropped.
    return _CONTROL_CHARS_RE.sub("", text.replace("\v", "\n"))


def clean_text(text):
    """Return *text* without control characters and without outer whitespace.

    Args:
        text: Raw string taken from the presentation; may be ``None`` or empty.

    Returns:
        The cleaned string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    return _strip_control_chars(text).strip()


def _render_run(run):
    """Return the Markdown for one text run, keeping its surrounding spaces.

    Whitespace is preserved (rather than stripped per run) so that adjacent
    runs such as ``"Hello "`` + ``"World"`` do not collapse into one word.
    """
    raw = _strip_control_chars(run.text or "")
    core = raw.strip()
    if not core or not run.font.bold:
        return raw
    # Keep the ``**`` markers tight against the text; Markdown does not treat
    # ``**bold **`` (space before the closing marker) as emphasis.
    lead = raw[: len(raw) - len(raw.lstrip())]
    trail = raw[len(raw.rstrip()):]
    return f"{lead}**{core}**{trail}"


def _write_text_shape(f, shape, is_title):
    """Write every non-empty paragraph of *shape* to *f*, one per line."""
    prefix = "### " if is_title else ""
    lines = []
    for paragraph in shape.text_frame.paragraphs:
        para_text = "".join(_render_run(run) for run in paragraph.runs).strip()
        if para_text:
            lines.append(f"{prefix}{para_text}\n")
    # Collect first, write afterwards: a failure while reading the shape then
    # leaves no half-written block in the report.
    if lines:
        f.writelines(lines)
        f.write("\n")


def _table_cell(text):
    """Return *text* cleaned and made safe for use inside a Markdown table cell."""
    return clean_text(text).replace("|", "\\|").replace("\n", "<br>")


def _write_table(f, table):
    """Write *table* to *f* as a Markdown table."""
    # The header and separator must have as many columns as the data rows,
    # otherwise Markdown renderers drop the extra cells of every row.
    n_cols = max(1, len(table.columns))
    header = ["PPT 表格数据"] + [""] * (n_cols - 1)
    f.write("\n| " + " | ".join(header) + " |\n")
    f.write("| " + " | ".join(["---"] * n_cols) + " |\n")
    for row in table.rows:
        row_data = [_table_cell(cell.text_frame.text) for cell in row.cells]
        f.write("| " + " | ".join(row_data) + " |\n")
    f.write("\n")


def _is_picture(shape):
    """Return True when *shape* is a picture shape."""
    try:
        return shape.shape_type == MSO_SHAPE_TYPE.PICTURE
    except NotImplementedError:
        # Shape proxies for unsupported element types do not implement
        # ``shape_type``; they are certainly not pictures.
        return False


def _save_picture(f, shape, slide_no, media_folder):
    """Save the image of *shape* into *media_folder* and link it from *f*."""
    image = shape.image
    image_name = f"slide_{slide_no}_img_{shape.shape_id}.{image.ext}"
    image_path = os.path.join(media_folder, image_name)
    with open(image_path, "wb") as img_f:
        img_f.write(image.blob)
    f.write(f"![幻灯片图片]({media_folder}/{image_name})\n\n")


def extract_pptx_fixed(pptx_path, output_md, media_folder):
    """Extract notes, text, tables and pictures of a .pptx into a Markdown report.

    Shapes on each slide are processed top-to-bottom, then left-to-right.
    Damaged text frames and unreadable pictures are reported on stdout and
    skipped so that one bad shape does not abort the whole extraction.

    Args:
        pptx_path: Path of the presentation to read. If it does not exist a
            message is printed and nothing is written.
        output_md: Path of the Markdown file to create (UTF-8, overwritten).
        media_folder: Directory that receives the extracted pictures; created
            when missing.

    Returns:
        None.
    """
    if not os.path.exists(pptx_path):
        print(f"找不到文件: {pptx_path}")
        return

    os.makedirs(media_folder, exist_ok=True)

    prs = Presentation(pptx_path)

    with open(output_md, "w", encoding="utf-8") as f:
        f.write(f"# PPT 提取报告: {os.path.basename(pptx_path)}\n\n")

        for slide_no, slide in enumerate(prs.slides, start=1):
            f.write(f"## --- 第 {slide_no} 页 ---\n\n")

            # 1. Speaker notes (cleaned like every other text).
            if slide.has_notes_slide:
                # The notes slide may lack a notes placeholder, in which case
                # there is no text frame to read.
                notes_frame = slide.notes_slide.notes_text_frame
                cleaned_notes = clean_text(notes_frame.text) if notes_frame is not None else ""
                if cleaned_notes:
                    f.write(f"> **演讲者备注**: {cleaned_notes}\n\n")

            # 2. Visual order. A shape without an explicit position reports
            #    None, which cannot be compared with ints; treat it as 0.
            shapes = sorted(slide.shapes, key=lambda s: (s.top or 0, s.left or 0))
            title_shape = slide.shapes.title

            for shape in shapes:
                # --- A. Text ---
                if shape.has_text_frame:
                    is_title = title_shape is not None and shape == title_shape
                    try:
                        _write_text_shape(f, shape, is_title)
                    except Exception as e:  # best effort: skip damaged frames
                        print(f"跳过第 {slide_no} 页的损坏文本框: {e}")

                # --- B. Tables ---
                if shape.has_table:
                    _write_table(f, shape.table)

                # --- C. Pictures ---
                if _is_picture(shape):
                    try:
                        _save_picture(f, shape, slide_no, media_folder)
                    except Exception as e:  # best effort: skip unreadable images
                        print(f"无法保存图片 (可能是损坏的OLE对象): {e}")

    print(f"✅ 修复版提取完成!乱码已被过滤。保存至: {output_md}")


# --- Entry point ---
if __name__ == "__main__":
    # Replace with your own file names.
    extract_pptx_fixed("1.pptx", "output_ppt_fixed.md", "ppt_media_fixed")