Stream contents.tsv.zst and extract matching man pages as .gz files. The contents TSV has columns: id, hash, content. The file is streamed via zstd to avoid loading it fully into memory. Gz headers are written with ``mtime`` set to the UTC epoch of the chosen pkgver's ``released`` date; this matches the Last-Modified semantics manned.org serves and is fully deterministic across runs.
(data_dir, content_to_manpages, content_to_released, output_dir)
| 593 | |
| 594 | |
| 595 | def extract_contents(data_dir, content_to_manpages, content_to_released, output_dir): |
| 596 | """ |
| 597 | Stream contents.tsv.zst and extract matching man pages as .gz files. |
| 598 | |
| 599 | The contents TSV has columns: id, hash, content |
| 600 | Streams via zstd to avoid loading the full file into memory. |
| 601 | |
| 602 | Gz headers are written with ``mtime`` set to the UTC epoch of the |
| 603 | chosen pkgver's ``released`` date — this matches the Last-Modified |
| 604 | semantics manned.org serves and is fully deterministic across runs. |
| 605 | """ |
| 606 | contents_zst = os.path.join(data_dir, "contents.tsv.zst") |
| 607 | logger.info("Streaming %s ...", contents_zst) |
| 608 | |
| 609 | zstd_proc = subprocess.Popen( |
| 610 | ["zstd", "-d", "-c", contents_zst], |
| 611 | stdout=subprocess.PIPE, |
| 612 | ) |
| 613 | |
| 614 | sections_created = set() |
| 615 | extracted = 0 |
| 616 | symlinks_created = 0 |
| 617 | total_needed = len(content_to_manpages) |
| 618 | remaining = set(content_to_manpages.keys()) |
| 619 | |
| 620 | for line_bytes in zstd_proc.stdout: |
| 621 | if not remaining: |
| 622 | break |
| 623 | |
| 624 | try: |
| 625 | line = line_bytes.decode("utf-8", errors="replace") |
| 626 | except Exception: |
| 627 | continue |
| 628 | |
| 629 | # Split on first two tabs only (content may contain tabs) |
| 630 | parts = line.split("\t", 2) |
| 631 | if len(parts) < 3: |
| 632 | continue |
| 633 | |
| 634 | try: |
| 635 | content_id = int(parts[0]) |
| 636 | except ValueError: |
| 637 | continue |
| 638 | |
| 639 | if content_id not in remaining: |
| 640 | continue |
| 641 | |
| 642 | # Unescape PostgreSQL COPY format |
| 643 | raw_content = parts[2] |
| 644 | raw_content = ( |
| 645 | raw_content.replace("\\n", "\n").replace("\\t", "\t").replace("\\\\", "\\") |
| 646 | ) |
| 647 | # Remove trailing newline from the TSV row itself |
| 648 | if raw_content.endswith("\n"): |
| 649 | raw_content = raw_content[:-1] |
| 650 | |
| 651 | # Write out as .gz files; first entry is the real file, rest are symlinks |
| 652 | pages = content_to_manpages[content_id] |
no test coverage detected