Run evaluation benchmarks across repositories. Args: repos: List of repo config names to evaluate (None = all). benchmarks: List of benchmark names to run (None = all). output_dir: Directory for CSV output files. Returns: Dict mapping ``{repo}_{benchmark}`` to list of result dicts.
(
repos: list[str] | None = None,
benchmarks: list[str] | None = None,
output_dir: str | Path | None = None,
)
| 131 | |
| 132 | |
| 133 | def run_eval( |
| 134 | repos: list[str] | None = None, |
| 135 | benchmarks: list[str] | None = None, |
| 136 | output_dir: str | Path | None = None, |
| 137 | ) -> dict[str, list[dict]]: |
| 138 | """Run evaluation benchmarks across repositories. |
| 139 | |
| 140 | Args: |
| 141 | repos: List of repo config names to evaluate (None = all). |
| 142 | benchmarks: List of benchmark names to run (None = all). |
| 143 | output_dir: Directory for CSV output files. |
| 144 | |
| 145 | Returns: |
| 146 | Dict mapping ``{repo}_{benchmark}`` to list of result dicts. |
| 147 | """ |
| 148 | output_dir = Path(output_dir) if output_dir else DEFAULT_OUTPUT |
| 149 | output_dir.mkdir(parents=True, exist_ok=True) |
| 150 | |
| 151 | if repos: |
| 152 | configs = [load_config(r) for r in repos] |
| 153 | else: |
| 154 | configs = load_all_configs() |
| 155 | |
| 156 | benchmark_names = benchmarks or list(BENCHMARK_REGISTRY.keys()) |
| 157 | all_results: dict[str, list[dict]] = {} |
| 158 | today = date.today().isoformat() |
| 159 | |
| 160 | for config in configs: |
| 161 | name = config["name"] |
| 162 | logger.info("Evaluating %s...", name) |
| 163 | |
| 164 | # Resolve the repo path to an absolute Path before handing it to |
| 165 | # full_build / get_db_path so the stored qualified_names match what |
| 166 | # the CLI/MCP layer produces (those paths go through _get_store -> |
| 167 | # _validate_repo_root which .resolve()s). Without this, a later |
| 168 | # ``code-review-graph update --repo <relative>`` writes the same |
| 169 | # function under a new absolute-prefixed qualified_name, leaving the |
| 170 | # graph with duplicate nodes for the same source location. |
| 171 | repo_path = clone_or_update(config).resolve() |
| 172 | |
| 173 | # Build graph |
| 174 | from code_review_graph.graph import GraphStore |
| 175 | from code_review_graph.incremental import full_build, get_db_path |
| 176 | from code_review_graph.postprocessing import run_post_processing |
| 177 | |
| 178 | db_path = get_db_path(repo_path) |
| 179 | store = GraphStore(db_path) |
| 180 | |
| 181 | full_build(repo_path, store) |
| 182 | # full_build is the parsing-only primitive; the higher-level CLI/MCP |
| 183 | # wrappers run postprocessing on top. The eval framework bypasses |
| 184 | # those, so call it directly here. Without this, FTS5 stays empty |
| 185 | # and downstream benchmarks (token_efficiency, search_quality) |
| 186 | # silently produce useless results. See: search.rebuild_fts_index. |
| 187 | pp_result = run_post_processing(store) |
| 188 | for warning in pp_result.get("warnings", []): |
| 189 | logger.warning(" postprocessing: %s", warning) |
| 190 |
no test coverage detected