新增识别已完成课程并跳过抓取

2025-12-12 14:59:24 +08:00
parent 57f4ae5006
commit d615bc7eb2
2 changed files with 1601 additions and 4 deletions
--- a/main.py
+++ b/main.py
@@ -397,12 +397,35 @@ if not course_data:
        try:
            resp = requests.get(url, headers=headers)
            resp.raise_for_status()
-            # 正则匹配课程ID
-            course_ids = re.findall(
-                r"window\.location\s*=\s*['\"]/lms/web/course/detail\?id=(\d+)['\"]", resp.text)
+
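+            # Filter out completed courses: scan the response HTML with a regex (no HTML parser is used).
+            # Each match captures a course ID plus the markup that follows the element carrying its onclick.
+            # Pattern: onclick="window.location='/lms/web/course/detail?id=ID'" ... <div class="txt vtop">
+            # The non-greedy .*? stops at the nearest <div class="txt vtop">; this assumes every course block contains one.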
+            matches = re.findall(
+                r"onclick=\"window\.location='/lms/web/course/detail\?id=(\d+)'\"[^>]*>(.*?)<div class=\"txt vtop\"",
+                resp.text,
+                re.DOTALL
+            )
+
+            course_ids = []
+            if matches:
+                print(f"[*] 解析到 {len(matches)} 个课程块，正在过滤已完成课程...")
+                for cid, content in matches:
+                    # 检查是否存在“已完成”的标记图片 (22cn_03.png)
+                    if "22cn_03.png" in content:
+                        print(f"    [-] 跳过已完成课程 (ID: {cid})")
+                    else:
+                        course_ids.append(cid)
+
+            # 如果上面的复杂正则没匹配到任何东西（可能页面结构变了），回退到简单正则
+            if not matches and not course_ids:
+                print("[!] 未能通过高级过滤匹配到课程，尝试使用基础匹配（将包含已完成课程）...")
+                course_ids = re.findall(
+                    r"window\.location\s*=\s*['\"]/lms/web/course/detail\?id=(\d+)['\"]", resp.text)

            if course_ids:
-                print(f"成功获取到 {len(course_ids)} 门课程。")
+                print(f"成功获取到 {len(course_ids)} 门未完成课程。")
                with open(cache_file, "w", encoding="utf-8") as f:
                    json.dump(course_ids, f, indent=2)
            else: