MCPcopy
hub / github.com/langroid/langroid / Parser

Class Parser

langroid/parsing/parser.py:148–425  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

146
147
148class Parser:
def __init__(self, config: ParsingConfig):
    """Store the parsing config and pick a tiktoken encoding.

    The encoding is looked up from ``config.token_encoding_model``. If that
    lookup raises for any reason, the encoding for
    "text-embedding-3-small" is used instead.

    Args:
        config: parsing settings; kept on the instance as ``self.config``.
    """
    self.config = config
    try:
        encoding = tiktoken.encoding_for_model(config.token_encoding_model)
    except Exception:
        # Best-effort fallback: an unusable model name must not prevent
        # the parser from being constructed.
        encoding = tiktoken.encoding_for_model("text-embedding-3-small")
    self.tokenizer = encoding
155
def num_tokens(self, text: str) -> int:
    """Return the size of ``text`` in the unit used by the configured splitter.

    With the MARKDOWN splitter the size is a word count (simple count based
    on whitespace-split); for every other splitter it is the number of
    tokens produced by ``self.tokenizer``.

    Args:
        text: the string to measure.

    Returns:
        The word count or token count, depending on ``self.config.splitter``.
    """
    counts_words = self.config.splitter == Splitter.MARKDOWN
    if not counts_words:
        # "<|endoftext|>" is explicitly allowed so text containing it can
        # still be encoded.
        encoded = self.tokenizer.encode(text, allowed_special={"<|endoftext|>"})
        return len(encoded)
    return count_words(text)
161
def truncate_tokens(self, text: str, max_tokens: int) -> str:
    """Return ``text`` cut down to at most ``max_tokens`` tokens.

    Args:
        text: the string to truncate.
        max_tokens: maximum number of tokens to keep; values below zero
            are treated as zero.

    Returns:
        ``text`` unchanged when it already fits within the limit, otherwise
        the decoding of its first ``max_tokens`` tokens.
    """
    # Allow the same special token that num_tokens allows, so text
    # containing "<|endoftext|>" is encoded instead of being rejected
    # (tiktoken disallows special-token text by default).
    tokens = self.tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    # A negative limit would otherwise slice relative to the END of the
    # token list and keep almost everything.
    limit = max(0, max_tokens)
    if len(tokens) <= limit:
        return text
    return self.tokenizer.decode(tokens[:limit])
167
def add_window_ids(self, chunks: List[Document]) -> None:
    """Give every non-empty chunk a fresh id plus a window of neighbor ids.

    Chunks may belong to multiple docs, but the chunks of each doc must
    appear consecutively. Chunks whose content is blank are skipped and
    left untouched.

    ASSUMPTION: all chunks of one doc arrive with the same ``metadata.id``.
    That incoming id is used only to group chunks by document and is then
    overwritten, since a distinct id per chunk is what is wanted.

    Args:
        chunks: chunks to annotate. For each non-empty chunk,
            ``metadata.id``, ``metadata.window_ids`` and
            ``metadata.is_chunk`` are set in place.
    """
    non_empty = [chunk for chunk in chunks if chunk.content.strip() != ""]
    if not non_empty:
        return

    # Read all incoming (per-document) ids first, then mint one new id per chunk.
    doc_ids = [chunk.metadata.id for chunk in non_empty]
    new_ids = [ObjectRegistry.new_id() for _ in non_empty]
    chunk_by_id = dict(zip(new_ids, non_empty))

    # Group the new ids by incoming id: each distinct incoming id stands for
    # one document, and its list keeps the chunks' original order.
    ids_by_doc: Dict[str, List[str]] = {}
    for doc_id, new_id in zip(doc_ids, new_ids):
        ids_by_doc.setdefault(doc_id, []).append(new_id)

    radius = self.config.n_neighbor_ids
    for doc_chunk_ids in ids_by_doc.values():
        total = len(doc_chunk_ids)
        for pos, chunk_id in enumerate(doc_chunk_ids):
            # Window = up to `radius` neighbors on each side, clipped to
            # the boundaries of this document's chunk sequence.
            start = max(0, pos - radius)
            stop = min(total, pos + radius + 1)
            chunk = chunk_by_id[chunk_id]
            chunk.metadata.window_ids = doc_chunk_ids[start:stop]
            chunk.metadata.id = chunk_id
            chunk.metadata.is_chunk = True
203
204 def split_simple(self, docs: List[Document]) -> List[Document]:
205 if len(self.config.separators) == 0:

Callers 11

get_documentsMethod · 0.90
__init__Method · 0.90
__init__Method · 0.90
ingest_doc_pathsMethod · 0.90
test_parserFunction · 0.90
test_text_token_chunkingFunction · 0.90
test_chunk_tokensFunction · 0.90

Calls

no outgoing calls

Tested by 6

test_parserFunction · 0.72
test_text_token_chunkingFunction · 0.72
test_chunk_tokensFunction · 0.72

Used in the wild real call sites across dependent graphs

searching dependent graphs…