This commit is contained in:
ChuXun
2026-01-18 18:53:17 +08:00
parent ab59c89fe4
commit 1aacbb1d18
172 changed files with 50112 additions and 1003 deletions

View File

@@ -64,14 +64,27 @@ def parse_knowledge_entries(html_path: Path) -> List[KnowledgeEntry]:
"""Scan the saved HTML page and extract all knowledge entries."""
soup = BeautifulSoup(html_path.read_text(encoding="utf-8"), "html.parser")
entries: Dict[str, KnowledgeEntry] = {}
for span in soup.select("span.posCatalog_name"):
onclick = span.get("onclick", "")
match = _ONCLICK_RE.search(onclick)
if not match:
for div in soup.select("div.posCatalog_select"):
div_id = div.get("id", "")
if not div_id.startswith("cur"):
continue
knowledge_id = match["knowledge"]
title_attr = span.get("title") or span.get_text(strip=True)
title = " ".join(title_attr.split())
knowledge_id = div_id[3:]
span = div.select_one("span.posCatalog_name")
if not span:
continue
title_attr = span.get("title")
em = span.select_one("em.posCatalog_sbar")
if title_attr:
if em:
title = f"{em.get_text(strip=True)} {title_attr}"
else:
title = title_attr
else:
title = span.get_text(strip=True)
title = " ".join(title.split())
if knowledge_id not in entries:
entries[knowledge_id] = KnowledgeEntry(knowledge_id=knowledge_id, title=title)
return list(entries.values())