Method split

langroid/parsing/parser.py:402–425 · view source on GitHub ↗

(self, docs: List[Document])

Source from the content-addressed store, hash-verified

400	return chunks
401
def split(self, docs: List[Document]) -> List[Document]:
    """Split documents into chunks, passing through docs that are already chunks.

    Every doc whose ``metadata.id`` is ``None`` or ``""`` is assigned a new id
    from ``ObjectRegistry.new_id()``; this mutates the input docs in place.
    Docs whose ``metadata.is_chunk`` is truthy are not split again. The
    remaining docs are handed to the splitting method selected by
    ``self.config.splitter``.

    Args:
        docs: Documents to split.

    Returns:
        The already-chunked docs (in input order) followed by the chunks
        produced from the remaining docs. Empty list if ``docs`` is empty.

    Raises:
        ValueError: If ``self.config.splitter`` matches none of the
            handled ``Splitter`` values.
    """
    if len(docs) == 0:
        return []

    # Ensure every doc carries an id; the original author notes these are
    # needed later (in add_window_ids) to tell docs apart.
    for doc in docs:
        if doc.metadata.id in (None, ""):
            doc.metadata.id = ObjectRegistry.new_id()

    # Docs that are already chunks must not be split any further.
    already_chunked = [doc for doc in docs if doc.metadata.is_chunk]
    to_split = [doc for doc in docs if not doc.metadata.is_chunk]
    if not to_split:
        return already_chunked

    splitter = self.config.splitter
    if splitter == Splitter.MARKDOWN or splitter == Splitter.TOKENS:
        new_chunks = self.split_chunk_tokens(to_split)
    elif splitter == Splitter.PARA_SENTENCE:
        new_chunks = self.split_para_sentence(to_split)
    elif splitter == Splitter.SIMPLE:
        new_chunks = self.split_simple(to_split)
    else:
        raise ValueError(f"Unknown splitter: {self.config.splitter}")

    return already_chunked + new_chunks

test_vector_stores_context_windowFunction · 0.95

test_vector_stores_overlapping_matchesFunction · 0.95

test_parserFunction · 0.95

_format_entryMethod · 0.45

pydantic_major_versionFunction · 0.45

get_field_namesFunction · 0.45

get_valueFunction · 0.45

extract_fieldsFunction · 0.45

nested_dict_from_flatFunction · 0.45

format_footnote_textFunction · 0.45

shorten_textFunction · 0.45

print_long_textFunction · 0.45

split_chunk_tokensMethod · 0.95

split_para_sentenceMethod · 0.95

split_simpleMethod · 0.95

new_idMethod · 0.80

test_vector_stores_context_windowFunction · 0.76

test_vector_stores_overlapping_matchesFunction · 0.76

test_parserFunction · 0.76

test_llm_portkeyFunction · 0.36

test_repo_loaderFunction · 0.36

test_doc_chat_agent_llmFunction · 0.36

test_doc_chat_agent_llm_asyncFunction · 0.36

test_doc_chat_agent_taskFunction · 0.36

test_retrieval_toolFunction · 0.36

test_add_enrichmentsFunction · 0.36

test_write_file_toolFunction · 0.36

test_write_file_tool_multiple_filesFunction · 0.36