(
cls_name: str, load_cls: Type[P], cls_extensions: List[FileExtension | str]
)
| 39 | # Dynamically creates Processor classes. Maybe redo this for finer control over instantiation. |
| 40 | # Processor classes are opaque as we don't know what params they would have -> not easy to have LSP completion. |
def _build_processor(
    cls_name: str, load_cls: Type[P], cls_extensions: List[FileExtension | str]
) -> Type[ProcessorInit]:
    """Create a processor class that wraps the loader class ``load_cls``.

    Instances of the generated class load a file with ``load_cls``, split the
    loaded documents with a text splitter and record the token count of every
    resulting chunk.

    Args:
        cls_name: Name given to the generated class.
        load_cls: Loader class, instantiated for every processed file as
            ``load_cls(file_path=..., **loader_kwargs)``.
        cls_extensions: Extensions exposed as ``supported_extensions``.

    Returns:
        A new class named ``cls_name`` whose base is ``ProcessorInit`` and
        whose namespace is copied from the local ``_Processor`` class.
    """
    # Encoding used to count the tokens of each produced chunk.
    enc = tiktoken.get_encoding("cl100k_base")

    class _Processor(ProcessorBase):
        supported_extensions = cls_extensions

        def __init__(
            self,
            splitter: TextSplitter | None = None,
            splitter_config: SplitterConfig | None = None,
            **loader_kwargs: Any,
        ) -> None:
            """Store the loader settings and pick the text splitter.

            Args:
                splitter: Splitter to use as-is. When ``None`` a
                    ``RecursiveCharacterTextSplitter`` is built from
                    ``splitter_config``.
                splitter_config: Chunking parameters. When ``None`` a fresh
                    ``SplitterConfig()`` is created for this instance.
                **loader_kwargs: Extra keyword arguments forwarded to the
                    loader constructor.
            """
            # Build the default per instance instead of sharing one object
            # created when the signature was evaluated.
            if splitter_config is None:
                splitter_config = SplitterConfig()

            self.loader_cls = load_cls
            self.loader_kwargs = loader_kwargs
            self.splitter_config = splitter_config

            if splitter is not None:
                self.text_splitter = splitter
            else:
                # NOTE(review): no encoding is passed here while chunk sizes
                # are counted with "cl100k_base" below - confirm both agree.
                self.text_splitter = (
                    RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                        chunk_size=splitter_config.chunk_size,
                        chunk_overlap=splitter_config.chunk_overlap,
                    )
                )

        @property
        def processor_metadata(self) -> dict[str, Any]:
            """Return the loader class name and the dumped splitter config."""
            return {
                "processor_cls": self.loader_cls.__name__,
                "splitter": self.splitter_config.model_dump(),
            }

        async def process_file_inner(self, file: QuivrFile) -> list[Document]:
            """Load ``file``, split it into chunks and tag each chunk's size.

            Returns:
                The chunks produced by the text splitter; each chunk's
                metadata is set to ``{"chunk_size": <token count>}``.
            """
            # Every Python object has ``__init__``, so the loader is always
            # constructed with the file path; no argument-less fallback exists.
            # NOTE: mypy can't correctly type this as BaseLoader doesn't have a constructor method
            loader = self.loader_cls(file_path=str(file.path), **self.loader_kwargs)  # type: ignore

            documents = await loader.aload()
            docs = self.text_splitter.split_documents(documents)

            for doc in docs:
                # NOTE(review): this replaces any metadata already present on
                # the chunk - confirm that dropping it is intended.
                doc.metadata = {"chunk_size": len(enc.encode(doc.page_content))}

            return docs

    # Re-create the class under the requested name with ProcessorInit as base.
    return type(cls_name, (ProcessorInit,), dict(_Processor.__dict__))
| 93 | |
| 94 | |
# Processor class for CSV files, generated around CSVLoader.
CSVProcessor = _build_processor(
    cls_name="CSVProcessor",
    load_cls=CSVLoader,
    cls_extensions=[FileExtension.csv],
)
no outgoing calls