59
handle.cpp
Normal file
@@ -0,0 +1,59 @@
#include<bits/stdc++.h>
#include<codecvt>
#include<windows.h>
using namespace std;

int main(){
    // Switch the console to UTF-8 so the Chinese messages below display correctly
    system("chcp 65001");

    // Enumerate all .srt files in the current directory via the Windows API
    WIN32_FIND_DATAW findData;
    HANDLE hFind = FindFirstFileW(L"*.srt", &findData);

    if(hFind == INVALID_HANDLE_VALUE){
        cout << "没有找到需要处理的srt文件" << endl;
        return 0;
    }

    // Converts wide filenames (from the Windows API) to UTF-8 for logging
    wstring_convert<codecvt_utf8<wchar_t>> converter;

    do {
        wstring wfilename = findData.cFileName;
        string filename = converter.to_bytes(wfilename);

        // Skip files that were already processed in a previous run
        if(filename.find("handled_") == 0){
            continue;
        }

        FILE* infile = _wfopen(wfilename.c_str(), L"r");
        if(!infile){
            cout << "无法打开文件: " << filename << endl;
            continue;
        }

        wstring woutname = L"handled_" + wfilename;
        FILE* outfile = _wfopen(woutname.c_str(), L"w");
        if(!outfile){
            cout << "无法创建输出文件: handled_" << filename << endl;
            fclose(infile);
            continue;
        }

        // Keep only the subtitle text: in a 4-line SRT block (cue number, timestamp,
        // text, blank separator) the text is the 3rd line, i.e. line_count % 4 == 3.
        char line[1024];
        int line_count=0;
        while(fgets(line, sizeof(line), infile)){
            line_count++;
            if(line_count%4==3){
                fputs(line, outfile);
            }
        }
        fclose(infile);
        fclose(outfile);
        cout << "已处理: " << filename << endl;

    } while(FindNextFileW(hFind, &findData));

    FindClose(hFind);
    return 0;
}
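For reference, handle.cpp assumes the strict 4-line SRT block layout (cue number, timestamp, a single line of text, blank separator); keeping every line where line_count % 4 == 3 therefore keeps exactly the spoken text, and multi-line cues would lose their extra lines. A small Python sketch of the same filter (illustrative only, not part of this commit):

blocks = """1
00:00:01,000 --> 00:00:03,000
first subtitle line

2
00:00:03,500 --> 00:00:05,000
second subtitle line
"""
text_lines = [line for i, line in enumerate(blocks.splitlines(), start=1) if i % 4 == 3]
print(text_lines)  # ['first subtitle line', 'second subtitle line']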
BIN
handle.exe
Normal file
Binary file not shown.
BIN
srt_downloader.exe
Normal file
Binary file not shown.
282
srt_downloader.py
Normal file
@@ -0,0 +1,282 @@
from __future__ import annotations

import json
import os
import re
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Constants for the Chaoxing endpoints
CARDS_URL = "https://mooc1.chaoxing.com/mooc-ans/knowledge/cards"
# Primary endpoints visible in cards payload / player scripts
RICH_SUBTITLE_URL = "https://mooc1.chaoxing.com/mooc-ans/richvideo/subtitle"
RICH_ALLSUBTITLE_URL = "https://mooc1.chaoxing.com/mooc-ans/richvideo/allsubtitle"
# Fallback (may return 500 depending on auth):
EDITOR_SUBTITLE_URL = "https://mooc1.chaoxing.com/ananas/video-editor/sub"

# Regex helpers
_ONCLICK_RE = re.compile(r"getTeacherAjax\('(?P<course>\d+)',\s*'(?P<class>\d+)',\s*'(?P<knowledge>\d+)'\)")
_JSON_RE = re.compile(r"mArg\s*=\s*(\{.*?\});", re.DOTALL)
_INVALID_FILENAME_RE = re.compile(r"[\\/:*?\"<>|]")


@dataclass(frozen=True)
class KnowledgeEntry:
    knowledge_id: str
    title: str


def load_cookie_from_env(env_var: str = "CHAOXING_COOKIE") -> Dict[str, str]:
    """Parse a raw cookie string from environment variables into a dict."""
    # Load .env values into the environment first
    try:
        load_dotenv()
    except Exception:
        pass
    raw_cookie = os.getenv(env_var, "").strip()
    cookies: Dict[str, str] = {}
    if not raw_cookie:
        return cookies
    for fragment in raw_cookie.split(";"):
        if not fragment.strip():
            continue
        if "=" not in fragment:
            continue
        name, value = fragment.split("=", 1)
        cookies[name.strip()] = value.strip()
    return cookies
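
# A sketch (not part of the original commit) of the .env entries read here and in
# main(); the cookie names and values below are placeholders:
#
#   CHAOXING_COOKIE=name1=value1; name2=value2
#   COURSE_URL=https://mooc1.chaoxing.com/...   (optional, only used when the saved HTML is absent)
#
# With that placeholder cookie string, load_cookie_from_env() returns
# {"name1": "value1", "name2": "value2"}.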


def sanitize_filename(name: str) -> str:
    """Turn a lesson title into a safe filename for Windows."""
    cleaned = _INVALID_FILENAME_RE.sub("_", name)
    return cleaned.strip() or "untitled"


def parse_knowledge_entries(html_path: Path) -> List[KnowledgeEntry]:
    """Scan the saved HTML page and extract all knowledge entries."""
    soup = BeautifulSoup(html_path.read_text(encoding="utf-8"), "html.parser")
    entries: Dict[str, KnowledgeEntry] = {}
    for span in soup.select("span.posCatalog_name"):
        onclick = span.get("onclick", "")
        match = _ONCLICK_RE.search(onclick)
        if not match:
            continue
        knowledge_id = match["knowledge"]
        title_attr = span.get("title") or span.get_text(strip=True)
        title = " ".join(title_attr.split())
        if knowledge_id not in entries:
            entries[knowledge_id] = KnowledgeEntry(knowledge_id=knowledge_id, title=title)
    return list(entries.values())
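
# Illustration (not part of the original commit; the last id below is invented):
# the loop above matches catalog spans whose onclick looks like
#   getTeacherAjax('256005147', '129437493', '1000001234')
# where the first two arguments correspond to the courseid/clazzid seen in
# base_params in main(), and the third becomes KnowledgeEntry.knowledge_id,
# paired with the span's title attribute (or text) as the lesson title.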


def extract_marg_json(cards_html: str) -> Optional[dict]:
    """Pull the mArg JSON payload from a cards response."""
    match = _JSON_RE.search(cards_html)
    if not match:
        return None
    json_block = match.group(1)
    # The captured object literal may span multiple lines; collapse newlines before parsing
    json_block = json_block.replace("\n", " ")
    try:
        return json.loads(json_block)
    except json.JSONDecodeError:
        # Retry with single quotes replaced by double quotes (JavaScript-style literals)
        sanitized = json_block
        sanitized = sanitized.replace("'", '"')
        try:
            return json.loads(sanitized)
        except json.JSONDecodeError:
            return None
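
# Illustration (not part of the original commit; field values are placeholders):
# the cards page embeds a script containing something along the lines of
#   mArg = {"defaults": {"knowledgeid": 1000001234, ...}, "attachments": [{"type": "video", "objectId": "abc123", "mid": "456", ...}]};
# _JSON_RE captures the {...} literal; the helpers below use defaults.knowledgeid
# to match the lesson and attachments[*].objectId / mid to request subtitles.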


def load_local_cards_marg(cards_dir: Path, knowledge_id: str) -> Optional[dict]:
    """Try to parse saved cards.html to get mArg when the network call fails."""
    # Saved file path observed: 学生学习页面_files/cards.html
    candidate = cards_dir / "cards.html"
    if not candidate.exists():
        return None
    try:
        text = candidate.read_text(encoding="utf-8", errors="ignore")
        marg = extract_marg_json(text)
        # Prefer an exact knowledgeid match, but fall back to whatever was parsed
        if marg and str(marg.get("defaults", {}).get("knowledgeid")) == str(knowledge_id):
            return marg
        return marg
    except Exception:
        return None


def iter_video_attachments(marg: dict) -> Iterable[dict]:
    """Yield the attachment dicts that correspond to videos."""
    attachments = marg.get("attachments") or []
    for attachment in attachments:
        if attachment.get("type") == "video":
            yield attachment


def is_html_error(text: str) -> bool:
    """Detect HTML error/login pages returned in place of subtitle data."""
    t = text.strip().lower()
    return t.startswith("<!doctype html>") and ("500" in t or "用户登录" in t)


def fetch_cards(session: requests.Session, knowledge_id: str, base_params: Dict[str, str]) -> Optional[dict]:
    params = dict(base_params)
    params["knowledgeid"] = knowledge_id
    response = session.get(CARDS_URL, params=params, timeout=10)
    if response.status_code != 200:
        print(f"[WARN] Failed to load cards for knowledge {knowledge_id}: HTTP {response.status_code}")
        return None
    marg = extract_marg_json(response.text)
    if marg is None:
        # Fallback: try the locally saved cards page
        cards_dir = Path("学生学习页面_files")
        marg = load_local_cards_marg(cards_dir, knowledge_id)
    if marg is None:
        print(f"[WARN] Could not locate mArg JSON for knowledge {knowledge_id}")
    return marg


def download_subtitle(session: requests.Session, marg: dict, object_id: str, mid: Optional[str], course_id: str, output_path: Path) -> bool:
    # Try richvideo/allsubtitle first (more complete), then richvideo/subtitle, then the editor fallback
    tried = []

    def try_get(url: str, params: dict) -> Optional[str]:
        tried.append((url, params))
        r = session.get(url, params=params, timeout=10)
        if r.status_code != 200:
            return None
        if not r.text.strip() or is_html_error(r.text):
            return None
        return r.text

    # Prefer the objectId and mid taken from the video attachment
    subtitle_url_base = marg.get("defaults", {}).get("subtitleUrl") or RICH_SUBTITLE_URL  # noted but not used below
    # 1) allsubtitle
    params_all = {"mid": mid or marg.get("attachments", [{}])[0].get("mid"), "objectid": object_id, "courseId": course_id}
    text = try_get(RICH_ALLSUBTITLE_URL, params_all)
    if not text:
        # 2) subtitle
        params_sub = {"mid": params_all["mid"], "objectid": object_id, "courseId": course_id}
        text = try_get(RICH_SUBTITLE_URL, params_sub)
    if not text:
        # 3) fallback editor endpoint
        text = try_get(EDITOR_SUBTITLE_URL, {"objectid": object_id})
    if text:
        output_path.write_text(text, encoding="utf-8")
        print(f"[INFO] Saved subtitle to {output_path}")
        return True
    print(f"[WARN] Subtitle download failed for object {object_id}. Tried: {tried}")
    return False


def main() -> None:
    load_dotenv()
    html_path = Path("学生学习页面.html")
    course_url = os.getenv("COURSE_URL", "").strip()
    if not html_path.exists():
        if course_url:
            print(f"[INFO] 本地HTML不存在,自动下载课程页面: {course_url}")
            cookies = load_cookie_from_env()
            session = requests.Session()
            session.headers.update({
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0 Safari/537.36",
                "Referer": course_url,
            })
            if cookies:
                session.cookies.update(cookies)
            resp = session.get(course_url, timeout=15)
            if resp.status_code == 200 and resp.text:
                html_path.write_text(resp.text, encoding="utf-8")
                print(f"[INFO] 已保存课程页面到 {html_path}")
            else:
                raise SystemExit(f"下载课程页面失败: HTTP {resp.status_code}")
        else:
            raise SystemExit("学生学习页面.html not found, 且未配置 COURSE_URL")

    entries = parse_knowledge_entries(html_path)
    if not entries:
        raise SystemExit("No knowledge entries found in saved HTML")
    print(f"[INFO] Found {len(entries)} lessons")

    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0 Safari/537.36",
        "Referer": "https://mooc1.chaoxing.com/",
    })
    cookies = load_cookie_from_env()
    if cookies:
        session.cookies.update(cookies)
        print(f"[INFO] Loaded {len(cookies)} cookies from environment")
    else:
        print("[WARN] No cookies provided. Requests may fail if authentication is required.")

    base_params = {
        "clazzid": "129437493",
        "courseid": "256005147",
        "num": "0",
        "ut": "s",
        "cpi": "441843723",
        "mooc2": "1",
        "isMicroCourse": "",
        "editorPreview": "",
    }

    # Step 1: download the subtitle track JSON for every lesson
    output_dir = Path("subtitles")
    output_dir.mkdir(exist_ok=True)
    for entry in entries:
        print(f"[INFO] Processing {entry.title} (knowledge {entry.knowledge_id})")
        marg = fetch_cards(session, entry.knowledge_id, base_params)
        if not marg:
            continue
        video_found = False
        for attachment in iter_video_attachments(marg):
            video_found = True
            object_id = attachment.get("objectId") or attachment.get("property", {}).get("objectid")
            mid = attachment.get("mid") or attachment.get("property", {}).get("mid")
            if not object_id:
                print(f"[WARN] No objectId for lesson {entry.title}")
                continue
            filename = sanitize_filename(entry.title) + ".txt"
            output_path = output_dir / filename
            if download_subtitle(session, marg, object_id, mid, base_params["courseid"], output_path):
                time.sleep(0.5)
        if not video_found:
            print(f"[WARN] No video attachments found for {entry.title}")

    # Step 2: batch-download the SRT files referenced by the track JSON
    srt_dir = Path("srt")
    srt_dir.mkdir(exist_ok=True)

    def fetch_srt(url: str, session: requests.Session) -> str | None:
        r = session.get(url, timeout=10)
        if r.status_code == 200 and r.text.strip():
            return r.text
        return None

    for txt_file in output_dir.glob("*.txt"):
        try:
            content = txt_file.read_text(encoding="utf-8").strip()
            data = json.loads(content)
            # data is a list of tracks, each expected to carry a "url" pointing at an .srt
            for track in data:
                url = track.get("url")
                if not url:
                    continue
                srt_text = fetch_srt(url, session)
                if not srt_text:
                    print(f"[WARN] Failed to fetch SRT from {url}")
                    continue
                name = txt_file.stem + ".srt"
                out_path = srt_dir / name
                out_path.write_text(srt_text, encoding="utf-8")
                print(f"[INFO] Saved {out_path}")
        except Exception as e:
            print(f"[WARN] {txt_file.name}: {e}")


if __name__ == "__main__":
    main()
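
Usage sketch (an assumption about the intended workflow, not stated in the commit): set CHAOXING_COOKIE (and optionally COURSE_URL) in a .env file, run python srt_downloader.py to populate subtitles/ and srt/, then run handle.exe from the directory holding the downloaded .srt files to produce the text-only handled_*.srt copies.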