新增识别已完成课程并跳过抓取
This commit is contained in:
1574
index.html
Normal file
1574
index.html
Normal file
File diff suppressed because it is too large
Load Diff
31
main.py
31
main.py
@@ -397,12 +397,35 @@ if not course_data:
|
||||
try:
|
||||
resp = requests.get(url, headers=headers)
|
||||
resp.raise_for_status()
|
||||
# 正则匹配课程ID
|
||||
course_ids = re.findall(
|
||||
r"window\.location\s*=\s*['\"]/lms/web/course/detail\?id=(\d+)['\"]", resp.text)
|
||||
|
||||
# 优化:解析HTML,排除已完成的课程
|
||||
# 查找所有课程块,提取ID和内容
|
||||
# 模式匹配:onclick="window.location='/lms/web/course/detail?id=ID'" ... <div class="txt vtop">
|
||||
# 使用非贪婪匹配 .*? 确保只匹配当前课程块
|
||||
matches = re.findall(
|
||||
r"onclick=\"window\.location='/lms/web/course/detail\?id=(\d+)'\"[^>]*>(.*?)<div class=\"txt vtop\"",
|
||||
resp.text,
|
||||
re.DOTALL
|
||||
)
|
||||
|
||||
course_ids = []
|
||||
if matches:
|
||||
print(f"[*] 解析到 {len(matches)} 个课程块,正在过滤已完成课程...")
|
||||
for cid, content in matches:
|
||||
# 检查是否存在“已完成”的标记图片 (22cn_03.png)
|
||||
if "22cn_03.png" in content:
|
||||
print(f" [-] 跳过已完成课程 (ID: {cid})")
|
||||
else:
|
||||
course_ids.append(cid)
|
||||
|
||||
# 如果上面的复杂正则没匹配到任何东西(可能页面结构变了),回退到简单正则
|
||||
if not matches and not course_ids:
|
||||
print("[!] 未能通过高级过滤匹配到课程,尝试使用基础匹配(将包含已完成课程)...")
|
||||
course_ids = re.findall(
|
||||
r"window\.location\s*=\s*['\"]/lms/web/course/detail\?id=(\d+)['\"]", resp.text)
|
||||
|
||||
if course_ids:
|
||||
print(f"成功获取到 {len(course_ids)} 门课程。")
|
||||
print(f"成功获取到 {len(course_ids)} 门未完成课程。")
|
||||
with open(cache_file, "w", encoding="utf-8") as f:
|
||||
json.dump(course_ids, f, indent=2)
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user