mirror of
https://github.com/ChuXunYu/OfficeFileHandle.git
synced 2026-01-30 17:51:26 +00:00
Initial commit
This commit is contained in:
2
.gitattributes
vendored
Normal file
2
.gitattributes
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
# Auto detect text files and perform LF normalization
|
||||
* text=auto
|
||||
103
docx2md/docx2md.py
Normal file
103
docx2md/docx2md.py
Normal file
@@ -0,0 +1,103 @@
|
||||
import subprocess
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
def convert_compact_mode(docx_file):
    """Convert a .docx file to compact GitHub-flavoured Markdown via pandoc.

    Produces four artifacts in the working directory:
      * ``<stem>_Compact.md``    - the cleaned Markdown document
      * ``<stem>_images/``       - media extracted by pandoc
      * ``<stem>_metadata.json`` - table/image/char counts + token estimate
      * ``<stem>_images.txt``    - numbered list of extracted image names

    Args:
        docx_file: Path to the source .docx document.

    Returns:
        None.  Prints an error and returns early if pandoc fails.
    """
    path_obj = Path(docx_file)
    base_name = path_obj.stem
    output_md = f"{base_name}_Compact.md"
    media_folder = f"{base_name}_images"
    metadata_file = f"{base_name}_metadata.json"
    images_list_file = f"{base_name}_images.txt"

    # 1. Key choice: gfm output forces compact pipe tables (no grid tables).
    cmd = ["pandoc", docx_file, "-o", output_md,
           "--to=gfm",  # GitHub-flavoured Markdown: rejects grid tables
           "--standalone",
           f"--extract-media={media_folder}",
           "--wrap=none",
           "--markdown-headings=atx"]

    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # Fix: capture_output=True swallows pandoc's stderr, so the old
        # generic handler printed a useless repr.  Surface the real message.
        detail = e.stderr.decode("utf-8", errors="replace") if e.stderr else e
        print(f"❌ 转换出错: {detail}")
        return
    except (FileNotFoundError, OSError) as e:
        # pandoc binary missing or not executable.
        print(f"❌ 转换出错: {e}")
        return

    with open(output_md, 'r', encoding='utf-8') as f:
        content = f.read()

    # 2. Compact regex cleanup.  gfm emits pipe tables, so we only need to
    #    match "|...|" rows and squash the alignment padding inside them.
    def compress_table_row(match):
        # Collapse runs of spaces to a single space, keeping the structure.
        row = match.group(0)
        return re.sub(r' +', ' ', row)

    # Match standard Markdown table rows.
    content = re.sub(r'^\|.*\|$', compress_table_row, content, flags=re.MULTILINE)

    # 3. Tag tables (logic is simple now - only one table format exists).
    tbl_count = 0

    def tag_table(match):
        nonlocal tbl_count
        tbl_count += 1
        return f"\n\n**[表格 {tbl_count}]**\n{match.group(0)}\n"

    # Match a contiguous table block: header rows, separator row, body rows.
    table_block_regex = r'(\|.*\|[\r\n]+)+(\|[\s\-:|]+\|[\r\n]+)(\|.*\|[\r\n]*)+'
    content = re.sub(table_block_regex, tag_table, content)

    # 4. Image tagging and manifest collection.
    img_count = 0
    img_details = []

    def img_replacer(m):
        nonlocal img_count
        img_count += 1
        path = m.group(2) or m.group(3) or ""
        filename = os.path.basename(path)
        img_details.append(filename)
        # Fix: use the extracted file name in the inline tag (it was
        # hard-coded to "(unknown)" even though the name had just been
        # computed); keep "(unknown)" only as the empty-path fallback.
        return f"\n\n**[图片 {img_count}: {filename or '(unknown)'}]**\n{m.group(0)}\n"

    content = re.sub(r'(!\[.*?\]\((.*?)\)|\[.*?\]:\s*([^\s]+\.(?:png|jpg|jpeg|gif|bmp))(\s*\{.*?\})?)', img_replacer, content)

    # 5. Deep-clean pandoc noise: internal "\l" links, reference-style link
    #    indices, width attributes, and runs of blank lines.
    content = re.sub(r'\[(.*?)\]\(\\l\)', r'\1', content)
    content = re.sub(r'\[(.*?)\]\[\d+\]', r'\1', content)
    content = re.sub(r'\{width=.*?\}', '', content)
    content = re.sub(r'\n{3,}', '\n\n', content)

    # 6. Persist the image manifest and metadata.
    with open(images_list_file, 'w', encoding='utf-8') as f:
        f.write(f"文档 {base_name} 图片清单:\n" + "="*30 + "\n")
        for i, name in enumerate(img_details, 1):
            f.write(f"{i}. {name}\n")

    stats = {
        "filename": base_name,
        "table_count": tbl_count,
        "image_count": img_count,
        "char_count": len(content),
        # Re-estimate tokens; with the padding removed this is more accurate.
        "estimated_tokens": int(len(content) * 0.6)
    }
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=4, ensure_ascii=False)

    with open(output_md, 'w', encoding='utf-8') as f:
        f.write(content)

    print("\n" + "="*60)
    print("✅ 紧凑版转换完成!已去除冗余空格")
    print(f"📁 文档: {output_md}")
    print(f"📊 统计: {img_count} 图片, {tbl_count} 表格")
    print("📉 Token 节省: 预估比原版节省 30% 空间")
    print("="*60)
|
||||
if __name__ == "__main__":
    # Sample document converted when the module is run as a script.
    source_doc = "a2.docx"
    convert_compact_mode(source_doc)
|
||||
101
pptx2md/pptx2md.py
Normal file
101
pptx2md/pptx2md.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import os
|
||||
import re
|
||||
from pptx import Presentation
|
||||
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
||||
|
||||
# --- 新增:文本清洗函数 ---
|
||||
def clean_text(text):
    """Return *text* with ASCII control characters removed and ends stripped.

    Tab (\\t), newline (\\n) and carriage return (\\r) are kept; every other
    control character in the ranges \\x00-\\x08, \\x0b, \\x0c, \\x0e-\\x1f and
    \\x7f is dropped.  Falsy input (None, "") yields "".
    """
    if not text:
        return ""

    # One pass over the string removes all disallowed control characters.
    # This alone resolves the vast majority of "binary garbage" artifacts;
    # a stricter non-text-ratio filter could be added later if needed.
    sanitized = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
    return sanitized.strip()
|
||||
|
||||
def extract_pptx_fixed(pptx_path, output_md, media_folder):
    """Extract notes, text, tables and images from a .pptx into Markdown.

    Args:
        pptx_path: Source presentation file.
        output_md: Destination Markdown file (overwritten).
        media_folder: Directory where embedded images are saved (created
            if missing).

    Returns:
        None.  Prints an error and returns early if *pptx_path* is missing.
    """
    if not os.path.exists(pptx_path):
        print(f"找不到文件: {pptx_path}")
        return

    # exist_ok avoids the check-then-create race of the original guard.
    os.makedirs(media_folder, exist_ok=True)

    prs = Presentation(pptx_path)

    with open(output_md, "w", encoding="utf-8") as f:
        f.write(f"# PPT 提取报告: {os.path.basename(pptx_path)}\n\n")

        for i, slide in enumerate(prs.slides):
            f.write(f"## --- 第 {i+1} 页 ---\n\n")

            # 1. Speaker notes (cleaned like all other text).
            if slide.has_notes_slide:
                notes = slide.notes_slide.notes_text_frame.text
                cleaned_notes = clean_text(notes)
                if cleaned_notes:
                    f.write(f"> **演讲者备注**: {cleaned_notes}\n\n")

            # 2. Visual top-to-bottom order.  Fix: shapes without an explicit
            #    position report top/left as None, which made the original
            #    sort raise TypeError - fall back to 0 for those.
            shapes = sorted(slide.shapes,
                            key=lambda s: (s.top or 0, s.left or 0))

            for shape in shapes:
                # --- A. Text frames ---
                if shape.has_text_frame:
                    try:
                        # Extract and clean paragraph by paragraph.
                        for paragraph in shape.text_frame.paragraphs:
                            para_text = ""
                            for run in paragraph.runs:
                                chunk = clean_text(run.text)
                                if not chunk:
                                    continue
                                if run.font.bold:
                                    chunk = f"**{chunk}**"
                                para_text += chunk

                            if para_text:
                                # Mark the slide title as a heading.
                                prefix = ""
                                if shape == slide.shapes.title:
                                    prefix = "### "
                                f.write(f"{prefix}{para_text}\n")
                        f.write("\n")
                    except Exception as e:
                        # Best-effort: skip corrupted text frames, keep going.
                        print(f"跳过第 {i+1} 页的损坏文本框: {e}")

                # --- B. Tables ---
                if shape.has_table:
                    f.write("\n| PPT 表格数据 |\n| --- |\n")
                    for row in shape.table.rows:
                        # Clean cell text; <br> keeps rows single-line.
                        row_data = [clean_text(cell.text_frame.text).replace('\n', '<br>')
                                    for cell in row.cells]
                        f.write("| " + " | ".join(row_data) + " |\n")
                    f.write("\n")

                # --- C. Pictures ---
                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                    try:
                        image = shape.image
                        ext = image.ext
                        image_name = f"slide_{i+1}_img_{shape.shape_id}.{ext}"
                        image_path = os.path.join(media_folder, image_name)

                        with open(image_path, "wb") as img_f:
                            img_f.write(image.blob)

                        # Fix: reference the saved image from the Markdown
                        # (the original wrote only two blank lines here, so
                        # extracted images were never linked).
                        f.write(f"![{image_name}]({image_path})\n\n")
                    except Exception as e:
                        # Best-effort: broken OLE objects are skipped.
                        print(f"无法保存图片 (可能是损坏的OLE对象): {e}")

    print(f"✅ 修复版提取完成!乱码已被过滤。保存至: {output_md}")
|
||||
|
||||
# --- 执行 ---
|
||||
if __name__ == "__main__":
    # Swap in your own file name before running.
    source_pptx = "1.pptx"
    extract_pptx_fixed(source_pptx, "output_ppt_fixed.md", "ppt_media_fixed")
|
||||
4
xlsx2csv/xlsx2csv.py
Normal file
4
xlsx2csv/xlsx2csv.py
Normal file
@@ -0,0 +1,4 @@
|
||||
import pandas as pd


def xlsx_to_csv(path="1.xlsx"):
    """Read the first sheet of *path* and return it as CSV text.

    pandas handles the Chinese/Unicode encoding automatically, avoiding
    mojibake in the output.

    Args:
        path: Excel workbook to read (defaults to the original "1.xlsx").

    Returns:
        The sheet serialized as CSV, without the index column.
    """
    df = pd.read_excel(path)
    return df.to_csv(index=False)


if __name__ == "__main__":
    # Fix: the conversion ran at import time with a hard-coded file name.
    # Guarded entry point; an optional CLI argument overrides the default.
    import sys

    source = sys.argv[1] if len(sys.argv) > 1 else "1.xlsx"
    print(xlsx_to_csv(source))
|
||||
Reference in New Issue
Block a user