| 574 | |
| 575 | |
| 576 | def scan(projects_dir=None, projects_dirs=None, db_path=DB_PATH, verbose=True): |
| 577 | conn = get_db(db_path) |
| 578 | init_db(conn) |
| 579 | |
| 580 | if projects_dirs: |
| 581 | dirs_to_scan = [Path(d) for d in projects_dirs] |
| 582 | elif projects_dir: |
| 583 | dirs_to_scan = [Path(projects_dir)] |
| 584 | else: |
| 585 | dirs_to_scan = DEFAULT_PROJECTS_DIRS |
| 586 | |
| 587 | jsonl_files = [] |
| 588 | for d in dirs_to_scan: |
| 589 | if not d.exists(): |
| 590 | continue |
| 591 | if verbose: |
| 592 | print(f"Scanning {d} ...") |
| 593 | jsonl_files.extend(glob.glob(str(d / "**" / "*.jsonl"), recursive=True)) |
| 594 | jsonl_files.sort() |
| 595 | |
| 596 | # One-time topic backfill for DBs whose sessions predate topic support: fill |
| 597 | # topics from title records in already-processed transcripts that an |
| 598 | # incremental scan would otherwise never revisit. Runs once, gated by the |
| 599 | # schema_meta 'topic_backfill_done' marker. It runs before the main loop, so |
| 600 | # on a fresh DB the sessions table is still empty and this no-ops; only DBs |
| 601 | # with pre-existing untitled sessions do real work. |
| 602 | if _meta_get(conn, "topic_backfill_done") != "1": |
| 603 | filled = _backfill_topics(conn, jsonl_files) |
| 604 | _meta_set(conn, "topic_backfill_done", "1") |
| 605 | conn.commit() |
| 606 | if verbose and filled: |
| 607 | print(f"Backfilled topic for {filled} existing session(s).") |
| 608 | |
| 609 | new_files = 0 |
| 610 | updated_files = 0 |
| 611 | skipped_files = 0 |
| 612 | total_turns = 0 |
| 613 | total_sessions = set() |
| 614 | |
| 615 | for filepath in jsonl_files: |
| 616 | try: |
| 617 | mtime = os.path.getmtime(filepath) |
| 618 | except OSError: |
| 619 | continue |
| 620 | |
| 621 | row = conn.execute( |
| 622 | "SELECT mtime, lines FROM processed_files WHERE path = ?", |
| 623 | (filepath,) |
| 624 | ).fetchone() |
| 625 | |
| 626 | if row and abs(row["mtime"] - mtime) < 0.01: |
| 627 | skipped_files += 1 |
| 628 | continue |
| 629 | |
| 630 | is_new = row is None |
| 631 | if verbose: |
| 632 | status = "NEW" if is_new else "UPD" |
| 633 | print(f" [{status}] {filepath}") |