MCPcopy
hub / github.com/QuivrHQ/quivr / _build_processor

Function _build_processor

core/quivr_core/processor/implementations/default.py:41–92  ·  view source on GitHub ↗
(
    cls_name: str, load_cls: Type[P], cls_extensions: List[FileExtension | str]
)

Source from the content-addressed store, hash-verified

39# Dynamically creates Processor classes. Maybe redo this for finer control over instantiation.
40# Processor classes are opaque, as we don't know what params they would have -> not easy to get LSP completion.
def _build_processor(
    cls_name: str, load_cls: Type[P], cls_extensions: List[FileExtension | str]
) -> Type[ProcessorInit]:
    """Create a processor class named ``cls_name`` that loads files with ``load_cls``.

    A template class is defined locally and its namespace is then re-used to
    build a new class with ``type()``, so the returned class carries the
    requested name and has ``ProcessorInit`` as its base.

    Args:
        cls_name: Name given to the generated class.
        load_cls: Loader class; it is instantiated with ``file_path=<path>`` plus
            any extra keyword arguments passed to the processor constructor.
        cls_extensions: Stored on the generated class as ``supported_extensions``.

    Returns:
        The generated processor class.
    """
    # Encoding used to measure each produced chunk (see ``process_file_inner``).
    enc = tiktoken.get_encoding("cl100k_base")

    class _Processor(ProcessorBase):
        supported_extensions = cls_extensions

        def __init__(
            self,
            splitter: TextSplitter | None = None,
            splitter_config: SplitterConfig | None = None,
            **loader_kwargs: Any,
        ) -> None:
            """Store the loader settings and pick the text splitter.

            Args:
                splitter: Splitter to use as-is. When omitted, a
                    ``RecursiveCharacterTextSplitter`` is built from
                    ``splitter_config``.
                splitter_config: Chunking configuration. When omitted, a fresh
                    ``SplitterConfig()`` is created for this instance.
                **loader_kwargs: Extra keyword arguments forwarded to the
                    loader class when a file is processed.
            """
            # A default built in the signature would be evaluated once and
            # shared by every instance of the generated class; build it here.
            if splitter_config is None:
                splitter_config = SplitterConfig()

            self.loader_cls = load_cls
            self.loader_kwargs = loader_kwargs

            self.splitter_config = splitter_config

            if splitter is not None:
                self.text_splitter = splitter
            else:
                self.text_splitter = (
                    RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                        chunk_size=splitter_config.chunk_size,
                        chunk_overlap=splitter_config.chunk_overlap,
                    )
                )

        @property
        def processor_metadata(self) -> dict[str, Any]:
            """Return the loader class name and the dumped splitter configuration."""
            return {
                "processor_cls": self.loader_cls.__name__,
                "splitter": self.splitter_config.model_dump(),
            }

        async def process_file_inner(self, file: QuivrFile) -> list[Document]:
            """Load ``file`` with the loader class and split it into chunks.

            Each returned document's metadata is set to a single
            ``"chunk_size"`` entry: the length of the ``cl100k_base`` encoding
            of its ``page_content``.
            """
            # Every class exposes ``__init__`` (at minimum ``object.__init__``),
            # so a ``hasattr(loader_cls, "__init__")`` guard can never be false;
            # the loader is therefore always built with the file path.
            # NOTE: mypy can't correctly type this as BaseLoader doesn't have a constructor method
            loader = self.loader_cls(file_path=str(file.path), **self.loader_kwargs)  # type: ignore

            documents = await loader.aload()
            docs = self.text_splitter.split_documents(documents)

            for doc in docs:
                # NOTE(review): this replaces (does not merge) whatever metadata
                # the loader/splitter attached to the chunk - confirm intended.
                doc.metadata = {"chunk_size": len(enc.encode(doc.page_content))}

            return docs

    namespace = dict(_Processor.__dict__)
    # Without this the generated class would keep the template's qualified name
    # ("_build_processor.<locals>._Processor") in its repr.
    namespace["__qualname__"] = cls_name
    return type(cls_name, (ProcessorInit,), namespace)
93
94
# Processor class generated by the factory: CSVLoader for the csv extension.
CSVProcessor = _build_processor(
    cls_name="CSVProcessor",
    load_cls=CSVLoader,
    cls_extensions=[FileExtension.csv],
)

Callers 2

test___build_processorFunction · 0.90
default.pyFile · 0.85

Calls

no outgoing calls

Tested by 1

test___build_processorFunction · 0.72