Default chunker for plain text. Chunks by word count.
| 88 | |
| 89 | |
| 90 | class TextChunker(BaseChunker): |
| 91 | """Default chunker for plain text. Chunks by word count.""" |
| 92 | |
def load_parse_and_chunk(
    self,
    source: Any,
    source_id: str,
    source_column: str,
    source_type: Optional[str] = None,
) -> list[dict]:
    """Resolve *source* to text and split it into word-based chunks.

    The source is first turned into a string by ``self._load``; that
    string is then handed to ``self._chunk_by_words`` together with the
    identifier and column name.

    Args:
        source: Object to load text from; forwarded unchanged to ``_load``.
        source_id: Identifier forwarded to ``_chunk_by_words``.
        source_column: Column/key name forwarded to ``_chunk_by_words``.
        source_type: Accepted for interface compatibility; this method
            does not read it.

    Returns:
        Whatever ``_chunk_by_words`` produces for the loaded text.
    """
    return self._chunk_by_words(self._load(source), source_id, source_column)
| 105 | |
| 106 | def _load(self, source: Any) -> str: |
| 107 | from pathlib import Path |
| 108 | |
| 109 | if isinstance(source, Path) and source.exists(): |
| 110 | return Path(source).read_text() |
| 111 | if isinstance(source, str): |
| 112 | if source.endswith(".txt") and Path(source).exists(): |
| 113 | return Path(source).read_text() |
| 114 | return str(source) |
| 115 | |
| 116 | def _chunk_by_words( |
| 117 | self, text: str, source_id: str, source_column: str |
| 118 | ) -> list[dict]: |
| 119 | words = text.split() |
| 120 | chunks = [] |
| 121 | |
| 122 | step = self.config.chunk_size - self.config.chunk_overlap |
| 123 | if step <= 0: |
| 124 | raise ValueError( |
| 125 | f"chunk_overlap ({self.config.chunk_overlap}) must be less than " |
| 126 | f"chunk_size ({self.config.chunk_size})" |
| 127 | ) |
| 128 | chunk_index = 0 |
| 129 | |
| 130 | for i in range(0, len(words), step): |
| 131 | chunk_words = words[i : i + self.config.chunk_size] |
| 132 | |
| 133 | if len(chunk_words) < self.config.min_chunk_size: |
| 134 | continue |
| 135 | |
| 136 | chunk_text = " ".join(chunk_words) |
| 137 | if self.config.max_chunk_chars: |
| 138 | chunk_text = chunk_text[: self.config.max_chunk_chars] |
| 139 | |
| 140 | chunks.append( |
| 141 | { |
| 142 | "chunk_id": f"{source_id}_{chunk_index}", |
| 143 | "original_id": source_id, |
| 144 | source_column: chunk_text, |
| 145 | "chunk_index": chunk_index, |
| 146 | } |
| 147 | ) |
no outgoing calls