MCPcopy Index your code
hub / github.com/feast-dev/feast / TextChunker

Class TextChunker

sdk/python/feast/chunker.py:90–150  ·  view source on GitHub ↗

Default chunker for plain text. Splits the text into fixed-size windows of words, with an optional overlap between consecutive chunks.

Source from the content-addressed store, hash-verified

88
89
90class TextChunker(BaseChunker):
91 """Default chunker for plain text. Chunks by word count."""
92
def load_parse_and_chunk(
    self,
    source: Any,
    source_id: str,
    source_column: str,
    source_type: Optional[str] = None,
) -> list[dict]:
    """Load *source* as plain text and split it into word-based chunks.

    The source is first resolved to a string via ``_load`` and the result
    is handed to ``_chunk_by_words`` together with ``source_id`` and
    ``source_column``. ``source_type`` is accepted but not used here.

    Returns:
        The list of chunk dicts produced by ``_chunk_by_words``.
    """
    return self._chunk_by_words(self._load(source), source_id, source_column)
105
def _load(self, source: Any) -> str:
    """Resolve *source* to the text that should be chunked.

    Args:
        source: A ``pathlib.Path``, a string, or any other object.

    Returns:
        * the UTF-8 decoded file contents when *source* is a ``Path``
          pointing at an existing regular file;
        * the UTF-8 decoded file contents when *source* is a string that
          ends with ``".txt"`` and names an existing regular file;
        * ``str(source)`` in every other case (raw text, missing paths,
          directories, non-string objects).
    """
    from pathlib import Path

    def _is_regular_file(candidate: Path) -> bool:
        # Raw text that merely ends in ".txt" is probed as a path. The
        # underlying stat call can fail (e.g. name too long, embedded NUL)
        # instead of returning False; treat that as "not a file" so the
        # text is returned unchanged rather than crashing.
        try:
            return candidate.is_file()
        except (OSError, ValueError):
            return False

    if isinstance(source, Path):
        if _is_regular_file(source):
            # Explicit encoding keeps decoding independent of the locale.
            return source.read_text(encoding="utf-8")
    elif isinstance(source, str):
        if source.endswith(".txt"):
            candidate = Path(source)
            if _is_regular_file(candidate):
                return candidate.read_text(encoding="utf-8")

    return str(source)
115
116 def _chunk_by_words(
117 self, text: str, source_id: str, source_column: str
118 ) -> list[dict]:
119 words = text.split()
120 chunks = []
121
122 step = self.config.chunk_size - self.config.chunk_overlap
123 if step <= 0:
124 raise ValueError(
125 f"chunk_overlap ({self.config.chunk_overlap}) must be less than "
126 f"chunk_size ({self.config.chunk_size})"
127 )
128 chunk_index = 0
129
130 for i in range(0, len(words), step):
131 chunk_words = words[i : i + self.config.chunk_size]
132
133 if len(chunk_words) < self.config.min_chunk_size:
134 continue
135
136 chunk_text = " ".join(chunk_words)
137 if self.config.max_chunk_chars:
138 chunk_text = chunk_text[: self.config.max_chunk_chars]
139
140 chunks.append(
141 {
142 "chunk_id": f"{source_id}_{chunk_index}",
143 "original_id": source_id,
144 source_column: chunk_text,
145 "chunk_index": chunk_index,
146 }
147 )

Calls

no outgoing calls