1
This commit is contained in:
@@ -64,14 +64,27 @@ def parse_knowledge_entries(html_path: Path) -> List[KnowledgeEntry]:
|
||||
"""Scan the saved HTML page and extract all knowledge entries."""
|
||||
soup = BeautifulSoup(html_path.read_text(encoding="utf-8"), "html.parser")
|
||||
entries: Dict[str, KnowledgeEntry] = {}
|
||||
for span in soup.select("span.posCatalog_name"):
|
||||
onclick = span.get("onclick", "")
|
||||
match = _ONCLICK_RE.search(onclick)
|
||||
if not match:
|
||||
for div in soup.select("div.posCatalog_select"):
|
||||
div_id = div.get("id", "")
|
||||
if not div_id.startswith("cur"):
|
||||
continue
|
||||
knowledge_id = match["knowledge"]
|
||||
title_attr = span.get("title") or span.get_text(strip=True)
|
||||
title = " ".join(title_attr.split())
|
||||
knowledge_id = div_id[3:]
|
||||
span = div.select_one("span.posCatalog_name")
|
||||
if not span:
|
||||
continue
|
||||
|
||||
title_attr = span.get("title")
|
||||
em = span.select_one("em.posCatalog_sbar")
|
||||
|
||||
if title_attr:
|
||||
if em:
|
||||
title = f"{em.get_text(strip=True)} {title_attr}"
|
||||
else:
|
||||
title = title_attr
|
||||
else:
|
||||
title = span.get_text(strip=True)
|
||||
|
||||
title = " ".join(title.split())
|
||||
if knowledge_id not in entries:
|
||||
entries[knowledge_id] = KnowledgeEntry(knowledge_id=knowledge_id, title=title)
|
||||
return list(entries.values())
|
||||
|
||||
Reference in New Issue
Block a user