新增识别已完成课程并跳过抓取

This commit is contained in:
2025-12-12 14:59:24 +08:00
parent 57f4ae5006
commit d615bc7eb2
2 changed files with 1601 additions and 4 deletions

31
main.py
View File

@@ -397,12 +397,35 @@ if not course_data:
try:
resp = requests.get(url, headers=headers)
resp.raise_for_status()
# 正则匹配课程ID
course_ids = re.findall(
r"window\.location\s*=\s*['\"]/lms/web/course/detail\?id=(\d+)['\"]", resp.text)
# 优化:解析 HTML,排除已完成的课程
# 查找所有课程块,提取课程 ID 和块内容
# 模式:匹配 onclick="window.location='/lms/web/course/detail?id=ID'" ... <div class="txt vtop">
# 使用非贪婪匹配 .*?,确保只匹配当前课程块
matches = re.findall(
r"onclick=\"window\.location='/lms/web/course/detail\?id=(\d+)'\"[^>]*>(.*?)<div class=\"txt vtop\"",
resp.text,
re.DOTALL
)
course_ids = []
if matches:
print(f"[*] 解析到 {len(matches)} 个课程块,正在过滤已完成课程...")
for cid, content in matches:
# 检查是否存在“已完成”的标记图片 (22cn_03.png)
if "22cn_03.png" in content:
print(f" [-] 跳过已完成课程 (ID: {cid})")
else:
course_ids.append(cid)
# 如果上面的复杂正则没匹配到任何东西(可能页面结构变了),回退到简单正则
if not matches and not course_ids:
print("[!] 未能通过高级过滤匹配到课程,尝试使用基础匹配(将包含已完成课程)...")
course_ids = re.findall(
r"window\.location\s*=\s*['\"]/lms/web/course/detail\?id=(\d+)['\"]", resp.text)
if course_ids:
print(f"成功获取到 {len(course_ids)} 门课程。")
print(f"成功获取到 {len(course_ids)}未完成课程。")
with open(cache_file, "w", encoding="utf-8") as f:
json.dump(course_ids, f, indent=2)
else: