MCPcopy
hub / github.com/tirth8205/code-review-graph / _compute_summaries

Function _compute_summaries

code_review_graph/tools/build.py:133–358  ·  view source on GitHub ↗

Populate community_summaries, flow_snapshots, and risk_index tables. Uses batched aggregate queries and in-memory grouping instead of per-community/per-node loops. On graphs with ~100k edges this reduces the work from ``O(nodes + communities)`` SQLite round trips each doing their own B-tree scan to a handful of ``GROUP BY`` queries, turning what used to be an effective hang into a few seconds.

(store: Any)

Source from the content-addressed store, hash-verified

131
132
133def _compute_summaries(store: Any) -> None:
134 """Populate community_summaries, flow_snapshots, and risk_index tables.
135
136 Uses batched aggregate queries and in-memory grouping instead of
137 per-community/per-node loops. On graphs with ~100k edges this
138 reduces the work from ``O(nodes + communities)`` SQLite round trips
139 each doing their own B-tree scan to a handful of ``GROUP BY``
140 queries, turning what used to be an effective hang into a few
141 seconds.
142
143 Each summary block (community_summaries, flow_snapshots, risk_index)
144 is wrapped in an explicit transaction so the DELETE + INSERT sequence
145 is atomic. If a table doesn't exist yet the block is silently skipped.
146 """
147 import json as _json
148 from collections import defaultdict
149 from os.path import commonprefix
150
151 conn = store._conn
152
153 # -- community_summaries --
154 try:
155 conn.execute("BEGIN IMMEDIATE")
156 conn.execute("DELETE FROM community_summaries")
157
158 # Pre-compute per-qualified_name edge counts once. Previously
159 # this section ran a per-community triple-JOIN aggregate query
160 # (nodes LEFT JOIN edges LEFT JOIN edges), which on graphs with
161 # thousands of communities was the second-biggest hang.
162 edge_counts: dict[str, int] = defaultdict(int)
163 for row in conn.execute(
164 "SELECT source_qualified, COUNT(*) FROM edges GROUP BY source_qualified"
165 ):
166 edge_counts[row[0]] += row[1]
167 for row in conn.execute(
168 "SELECT target_qualified, COUNT(*) FROM edges GROUP BY target_qualified"
169 ):
170 edge_counts[row[0]] += row[1]
171
172 # Group non-File nodes per community for top-symbol selection.
173 nodes_by_comm: dict[int, list[tuple[str, int]]] = defaultdict(list)
174 for row in conn.execute(
175 "SELECT community_id, name, qualified_name FROM nodes "
176 "WHERE community_id IS NOT NULL AND kind != 'File'"
177 ):
178 cid, name, qn = row[0], row[1], row[2]
179 nodes_by_comm[cid].append((name, edge_counts.get(qn, 0)))
180
181 # Group distinct file paths per community (preserving first-seen
182 # order for stable output, same as DISTINCT in the old query).
183 files_by_comm: dict[int, list[str]] = defaultdict(list)
184 seen_files: dict[int, set[str]] = defaultdict(set)
185 for row in conn.execute(
186 "SELECT community_id, file_path FROM nodes WHERE community_id IS NOT NULL"
187 ):
188 cid, fp = row[0], row[1]
189 if fp not in seen_files[cid]:
190 seen_files[cid].add(fp)

Calls 3

getMethod · 0.80
commitMethod · 0.80
rollbackMethod · 0.80