Populate community_summaries, flow_snapshots, and risk_index tables. Uses batched aggregate queries and in-memory grouping instead of per-community/per-node loops. On graphs with ~100k edges this reduces the work from ``O(nodes + communities)`` SQLite round trips, each doing their own B-tree scan, to a handful of ``GROUP BY`` queries, turning what used to be an effective hang into a few seconds.
(store: Any)
| 131 | |
| 132 | |
| 133 | def _compute_summaries(store: Any) -> None: |
| 134 | """Populate community_summaries, flow_snapshots, and risk_index tables. |
| 135 | |
| 136 | Uses batched aggregate queries and in-memory grouping instead of |
| 137 | per-community/per-node loops. On graphs with ~100k edges this |
| 138 | reduces the work from ``O(nodes + communities)`` SQLite round trips |
| 139 | each doing their own B-tree scan to a handful of ``GROUP BY`` |
| 140 | queries, turning what used to be an effective hang into a few |
| 141 | seconds. |
| 142 | |
| 143 | Each summary block (community_summaries, flow_snapshots, risk_index) |
| 144 | is wrapped in an explicit transaction so the DELETE + INSERT sequence |
| 145 | is atomic. If a table doesn't exist yet the block is silently skipped. |
| 146 | """ |
| 147 | import json as _json |
| 148 | from collections import defaultdict |
| 149 | from os.path import commonprefix |
| 150 | |
| 151 | conn = store._conn |
| 152 | |
| 153 | # -- community_summaries -- |
| 154 | try: |
| 155 | conn.execute("BEGIN IMMEDIATE") |
| 156 | conn.execute("DELETE FROM community_summaries") |
| 157 | |
| 158 | # Pre-compute per-qualified_name edge counts once. Previously |
| 159 | # this section ran a per-community triple-JOIN aggregate query |
| 160 | # (nodes LEFT JOIN edges LEFT JOIN edges), which on graphs with |
| 161 | # thousands of communities was the second-biggest hang. |
| 162 | edge_counts: dict[str, int] = defaultdict(int) |
| 163 | for row in conn.execute( |
| 164 | "SELECT source_qualified, COUNT(*) FROM edges GROUP BY source_qualified" |
| 165 | ): |
| 166 | edge_counts[row[0]] += row[1] |
| 167 | for row in conn.execute( |
| 168 | "SELECT target_qualified, COUNT(*) FROM edges GROUP BY target_qualified" |
| 169 | ): |
| 170 | edge_counts[row[0]] += row[1] |
| 171 | |
| 172 | # Group non-File nodes per community for top-symbol selection. |
| 173 | nodes_by_comm: dict[int, list[tuple[str, int]]] = defaultdict(list) |
| 174 | for row in conn.execute( |
| 175 | "SELECT community_id, name, qualified_name FROM nodes " |
| 176 | "WHERE community_id IS NOT NULL AND kind != 'File'" |
| 177 | ): |
| 178 | cid, name, qn = row[0], row[1], row[2] |
| 179 | nodes_by_comm[cid].append((name, edge_counts.get(qn, 0))) |
| 180 | |
| 181 | # Group distinct file paths per community (preserving first-seen |
| 182 | # order for stable output, same as DISTINCT in the old query). |
| 183 | files_by_comm: dict[int, list[str]] = defaultdict(list) |
| 184 | seen_files: dict[int, set[str]] = defaultdict(set) |
| 185 | for row in conn.execute( |
| 186 | "SELECT community_id, file_path FROM nodes WHERE community_id IS NOT NULL" |
| 187 | ): |
| 188 | cid, fp = row[0], row[1] |
| 189 | if fp not in seen_files[cid]: |
| 190 | seen_files[cid].add(fp) |