Chunks may belong to multiple docs, but the chunks of each doc appear consecutively. Adds window_ids (the ids of neighboring chunks in the same doc) to each chunk's metadata.
(self, chunks: List[Document])
| 166 | return self.tokenizer.decode(tokens[:max_tokens]) |
| 167 | |
def add_window_ids(self, chunks: List[Document]) -> None:
    """Give each non-empty chunk a fresh id and the ids of its neighbors.

    Chunks may belong to multiple docs, but for each doc, they appear
    consecutively. Chunks are grouped by their original ``metadata.id``
    (ASSUMPTION: all chunks of a doc share the same ``metadata.id``). Within
    each group, a chunk's ``metadata.window_ids`` is set to the new ids of the
    chunks at most ``self.config.n_neighbor_ids`` positions before or after
    it, the chunk itself included.

    Chunks whose content is empty or whitespace-only are skipped: their
    metadata is left untouched and they never appear in any window.

    Args:
        chunks: chunks to annotate. The list itself is not modified, but the
            ``metadata`` of each non-empty chunk is updated in place
            (``id``, ``window_ids`` and ``is_chunk`` are overwritten).
    """
    # Discard empty chunks.
    non_empty = [c for c in chunks if c.content.strip() != ""]
    if not non_empty:
        return

    # The original metadata.id (if any) is only used as a grouping key: it is
    # the same for all chunks of a doc, so each chunk gets a new distinct id
    # from ObjectRegistry. All original ids are read here, before any
    # metadata is overwritten below.
    chunk_by_id: Dict[str, Document] = {}
    doc_to_chunk_ids: Dict[str, List[str]] = {}
    for chunk in non_empty:
        new_id = ObjectRegistry.new_id()
        chunk_by_id[new_id] = chunk
        doc_to_chunk_ids.setdefault(chunk.metadata.id, []).append(new_id)

    # Each entry of doc_to_chunk_ids is now the sequence of new ids of the
    # consecutive chunks of a single doc.
    k = self.config.n_neighbor_ids
    for doc_chunk_ids in doc_to_chunk_ids.values():
        n = len(doc_chunk_ids)
        for i, chunk_id in enumerate(doc_chunk_ids):
            chunk = chunk_by_id[chunk_id]
            # Slicing yields a new list per chunk, clamped to the doc bounds.
            chunk.metadata.window_ids = doc_chunk_ids[max(0, i - k) : min(n, i + k + 1)]
            chunk.metadata.id = chunk_id
            chunk.metadata.is_chunk = True
| 203 | |
| 204 | def split_simple(self, docs: List[Document]) -> List[Document]: |
| 205 | if len(self.config.separators) == 0: |
no test coverage detected