MCPcopy
hub / github.com/langroid/langroid / add_window_ids

Method add_window_ids

langroid/parsing/parser.py:168–202  ·  view source on GitHub ↗

Chunks may belong to multiple docs, but the chunks of each doc appear consecutively. Adds window_ids to each chunk's metadata.

(self, chunks: List[Document])

Source from the content-addressed store, hash-verified

166 return self.tokenizer.decode(tokens[:max_tokens])
167
def add_window_ids(self, chunks: List[Document]) -> None:
    """Give every non-empty chunk a fresh id and a window of neighbor ids.

    Chunks may belong to multiple docs, but the chunks of each doc are
    expected to appear consecutively in ``chunks``.

    For each chunk whose content is not blank, this sets, in place:

    * ``metadata.id`` -- a new id obtained from ``ObjectRegistry.new_id()``;
    * ``metadata.window_ids`` -- the new ids of up to
      ``self.config.n_neighbor_ids`` chunks before and after it within the
      same doc, in order, including the chunk's own id;
    * ``metadata.is_chunk`` -- ``True``.

    Chunks whose content is empty or whitespace-only are left untouched;
    they are skipped here but are NOT removed from the caller's list.

    ASSUMPTION: all chunks of a doc share the same incoming ``metadata.id``,
    and that value differs between docs. It is used only as the grouping
    key and is then overwritten.

    Args:
        chunks: chunks to annotate; their metadata is mutated in place.
    """
    # Filter into a local list only: the caller's list keeps its blank chunks.
    non_empty = [c for c in chunks if c.content.strip() != ""]
    if not non_empty:
        return

    # The incoming metadata.id is the same for every chunk of a doc, so it
    # cannot identify a chunk; it only tells us which doc a chunk came from.
    doc_ids = [c.metadata.id for c in non_empty]
    new_ids = [ObjectRegistry.new_id() for _ in non_empty]
    chunk_by_id = dict(zip(new_ids, non_empty))

    # Group the new chunk ids by source doc, preserving chunk order.
    doc_to_chunk_ids: Dict[str, List[str]] = {}
    for doc_id, chunk_id in zip(doc_ids, new_ids):
        doc_to_chunk_ids.setdefault(doc_id, []).append(chunk_id)

    k = self.config.n_neighbor_ids
    for doc_chunk_ids in doc_to_chunk_ids.values():
        # doc_chunk_ids are the consecutive chunks of a single doc.
        n = len(doc_chunk_ids)
        for i, chunk_id in enumerate(doc_chunk_ids):
            chunk = chunk_by_id[chunk_id]
            # Window is clipped at the doc's first and last chunk.
            chunk.metadata.window_ids = doc_chunk_ids[
                max(0, i - k) : min(n, i + k + 1)
            ]
            chunk.metadata.id = chunk_id
            chunk.metadata.is_chunk = True
203
204 def split_simple(self, docs: List[Document]) -> List[Document]:
205 if len(self.config.separators) == 0:

Callers 5

split_simpleMethod · 0.95
split_para_sentenceMethod · 0.95
split_chunk_tokensMethod · 0.95
get_doc_chunksMethod · 0.80
ingest_docsMethod · 0.80

Calls 1

new_idMethod · 0.80

Tested by

no test coverage detected