Given a collection of windows, where each window is a sequence of ids, identify groups of overlapping windows, and for each overlapping group, order the chunk-ids using topological sort so they appear in the original order in the text. Args: wind
(windows: List[List[str]])
| 345 | |
| 346 | @staticmethod |
| 347 | def remove_overlaps(windows: List[List[str]]) -> List[List[str]]: |
| 348 | """ |
| 349 | Given a collection of windows, where each window is a sequence of ids, |
| 350 | identify groups of overlapping windows, and for each overlapping group, |
| 351 | order the chunk-ids using topological sort so they appear in the original |
| 352 | order in the text. |
| 353 | |
| 354 | Args: |
| 355 | windows (List[int|str]): List of windows, where each window is a |
| 356 | sequence of ids. |
| 357 | |
| 358 | Returns: |
| 359 | List[int|str]: List of windows, where each window is a sequence of ids, |
| 360 | and no two windows overlap. |
| 361 | """ |
| 362 | ids = set(id for w in windows for id in w) |
| 363 | # id -> {win -> # pos} |
| 364 | id2win2pos: Dict[str, Dict[int, int]] = {id: {} for id in ids} |
| 365 | |
| 366 | for i, w in enumerate(windows): |
| 367 | for j, id in enumerate(w): |
| 368 | id2win2pos[id][i] = j |
| 369 | |
| 370 | n = len(windows) |
| 371 | # relation between windows: |
| 372 | order = np.zeros((n, n), dtype=np.int8) |
| 373 | for i, w in enumerate(windows): |
| 374 | for j, x in enumerate(windows): |
| 375 | if i == j: |
| 376 | continue |
| 377 | if len(set(w).intersection(x)) == 0: |
| 378 | continue |
| 379 | id = list(set(w).intersection(x))[0] # any common id |
| 380 | if id2win2pos[id][i] > id2win2pos[id][j]: |
| 381 | order[i, j] = -1 # win i is before win j |
| 382 | else: |
| 383 | order[i, j] = 1 # win i is after win j |
| 384 | |
| 385 | # find groups of windows that overlap, like connected components in a graph |
| 386 | groups = components(np.abs(order)) |
| 387 | |
| 388 | # order the chunk-ids in each group using topological sort |
| 389 | new_windows = [] |
| 390 | for g in groups: |
| 391 | # find total ordering among windows in group based on order matrix |
| 392 | # (this is a topological sort) |
| 393 | _g = np.array(g) |
| 394 | order_matrix = order[_g][:, _g] |
| 395 | ordered_window_indices = topological_sort(order_matrix) |
| 396 | ordered_window_ids = [windows[i] for i in _g[ordered_window_indices]] |
| 397 | flattened = [id for w in ordered_window_ids for id in w] |
| 398 | flattened_deduped = list(dict.fromkeys(flattened)) |
| 399 | # Note we are not going to split these, and instead we'll return |
| 400 | # larger windows from concatenating the connected groups. |
| 401 | # This ensures context is retained for LLM q/a |
| 402 | new_windows += [flattened_deduped] |
| 403 | |
| 404 | return new_windows |
no test coverage detected