Parse all metadata files. Returns parsed data structures.
(data_dir)
| 354 | |
| 355 | |
| 356 | def load_metadata(data_dir): |
| 357 | """Parse all metadata files. Returns parsed data structures.""" |
| 358 | # Parse systems: id -> (name, release, short) |
| 359 | systems = {} |
| 360 | for row in parse_tsv(os.path.join(data_dir, "systems.tsv")): |
| 361 | sys_id = int(row[0]) |
| 362 | name = row[1] |
| 363 | release = row[2] if len(row) > 2 else "" |
| 364 | short = row[3] if len(row) > 3 else "" |
| 365 | # Normalize PostgreSQL COPY NULL marker |
| 366 | if release == "\\N": |
| 367 | release = "" |
| 368 | if short == "\\N": |
| 369 | short = "" |
| 370 | systems[sys_id] = (name, release, short) |
| 371 | logger.info("Loaded %d systems", len(systems)) |
| 372 | |
| 373 | # Parse locales: id -> locale |
| 374 | locales = {} |
| 375 | for row in parse_tsv(os.path.join(data_dir, "locales.tsv")): |
| 376 | loc_id = int(row[0]) |
| 377 | locale = row[1] |
| 378 | locales[loc_id] = locale |
| 379 | logger.info("Loaded %d locales", len(locales)) |
| 380 | |
| 381 | # Determine English locale IDs |
| 382 | english_locale_ids = set() |
| 383 | for loc_id, locale in locales.items(): |
| 384 | if locale == "" or locale.startswith("en"): |
| 385 | english_locale_ids.add(loc_id) |
| 386 | logger.info("English locale IDs: %s", english_locale_ids) |
| 387 | |
| 388 | # Parse mans: id -> (name, section) |
| 389 | mans = {} |
| 390 | for row in parse_tsv(os.path.join(data_dir, "mans.tsv")): |
| 391 | man_id = int(row[0]) |
| 392 | name, section = row[1], row[2] |
| 393 | mans[man_id] = (name, section) |
| 394 | logger.info("Loaded %d man page entries", len(mans)) |
| 395 | |
| 396 | # Parse packages: id -> system_id |
| 397 | packages = {} |
| 398 | for row in parse_tsv(os.path.join(data_dir, "packages.tsv")): |
| 399 | pkg_id = int(row[0]) |
| 400 | system = int(row[1]) |
| 401 | packages[pkg_id] = system |
| 402 | logger.info("Loaded %d packages", len(packages)) |
| 403 | |
| 404 | # Parse package_versions: id -> (package_id, released_tuple) |
| 405 | # `released` (column 4) is a YYYY-MM-DD date that feeds the |
| 406 | # manned-style selector ranking — within a package we prefer the |
| 407 | # pkgver with the latest release date. |
| 408 | pkg_versions: dict[int, tuple[int, tuple[int, int, int]]] = {} |
| 409 | for row in parse_tsv(os.path.join(data_dir, "package_versions.tsv")): |
| 410 | pv_id = int(row[0]) |
| 411 | pkg_id = int(row[1]) |
| 412 | released = _parse_date(row[3]) if len(row) > 3 else (0, 0, 0) |
| 413 | pkg_versions[pv_id] = (pkg_id, released) |
no test coverage detected