| 373 | |
| 374 | |
def clean_tree(
    tree: ET.ElementTree,
    path: Path,
    check_results: Dict[str, FeedCheckResult],
    removable_urls: Set[str],
) -> Tuple[int, int, int]:
    """Remove dead and duplicate feed outlines from ``tree`` in place.

    Walks every ``outline`` element below the element returned by
    ``get_body(tree, path)``. Outlines accepted by ``is_rss_outline`` are
    treated as feeds; every other ``outline`` is recursed into. Children
    whose tag is not ``"outline"`` are left untouched.

    For each feed outline, in this order:

    1. Its ``xmlUrl`` attribute is normalized with ``normalize_url``. If the
       normalized value is empty the outline is removed and counted as dead;
       otherwise the attribute is rewritten when normalization changed it.
    2. If its check result is not alive *and* its URL is in
       ``removable_urls``, the outline is removed and counted as dead.
    3. If its URL was already kept earlier in the walk, the outline is
       removed and counted as a duplicate.
    4. Otherwise it is kept; if its check result is not alive it is counted
       as a retained failure.

    Args:
        tree: Parsed document to clean; mutated in place.
        path: Passed through to ``get_body`` (presumably the source file
            path, used for error reporting -- confirm against ``get_body``).
        check_results: Check outcome keyed by normalized feed URL. URLs with
            no entry are treated as not alive.
        removable_urls: Normalized URLs that may be deleted when their check
            result is not alive.

    Returns:
        ``(dead_removed, dup_removed, retained_failed)``.
    """
    body = get_body(tree, path)
    seen_urls: Set[str] = set()
    dead_removed = 0
    dup_removed = 0
    retained_failed = 0

    # Fallback verdict for URLs that were never checked. It is not alive, so
    # whether such a feed is deleted still depends on ``removable_urls``.
    # Built once here instead of once per outline.
    missing_result = FeedCheckResult(
        alive=False, kind="transient_fail", reason="missing_check_result"
    )

    def visit(parent: ET.Element) -> None:
        nonlocal dead_removed, dup_removed, retained_failed
        # Iterate over a snapshot: children are removed from ``parent``
        # while walking.
        for child in list(parent):
            if child.tag != "outline":
                continue
            if not is_rss_outline(child):
                # Not a feed itself (e.g. a grouping outline): descend.
                visit(child)
                continue

            raw = child.attrib.get("xmlUrl", "")
            url = normalize_url(raw)
            if not url:
                parent.remove(child)
                dead_removed += 1
                continue
            if raw != url:
                child.attrib["xmlUrl"] = url

            result = check_results.get(url, missing_result)
            if not result.alive and url in removable_urls:
                parent.remove(child)
                dead_removed += 1
                continue

            if url in seen_urls:
                parent.remove(child)
                dup_removed += 1
                continue
            seen_urls.add(url)

            # Count failures only after the duplicate check, so an entry
            # that was just removed as a duplicate is never reported as
            # "retained".
            if not result.alive:
                retained_failed += 1

    visit(body)
    return dead_removed, dup_removed, retained_failed
| 425 | |
| 426 | |
| 427 | def top_level_categories(body: ET.Element) -> List[ET.Element]: |