Class TextChunker

sdk/python/feast/chunker.py:90–150 · view source on GitHub ↗

Default chunker for plain text. Chunks by word count.

Source from the content-addressed store, hash-verified

88
89
90	class TextChunker(BaseChunker):
91	"""Default chunker for plain text. Chunks by word count."""
92
def load_parse_and_chunk(
    self,
    source: Any,
    source_id: str,
    source_column: str,
    source_type: Optional[str] = None,
) -> list[dict]:
    """Load ``source`` as text and split it into word-based chunks.

    Args:
        source: Anything accepted by ``self._load``.
        source_id: Identifier forwarded to ``self._chunk_by_words``.
        source_column: Column name forwarded to ``self._chunk_by_words``.
        source_type: Accepted for interface compatibility; not used here.

    Returns:
        Whatever ``self._chunk_by_words`` produces for the loaded text.
    """
    # Load first, then hand the text straight to the word chunker.
    return self._chunk_by_words(self._load(source), source_id, source_column)
105
106	def _load(self, source: Any) -> str:
107	from pathlib import Path
108
109	if isinstance(source, Path) and source.exists():
110	return Path(source).read_text()
111	if isinstance(source, str):
112	if source.endswith(".txt") and Path(source).exists():
113	return Path(source).read_text()
114	return str(source)
115
116	def _chunk_by_words(
117	self, text: str, source_id: str, source_column: str
118	) -> list[dict]:
119	words = text.split()
120	chunks = []
121
122	step = self.config.chunk_size - self.config.chunk_overlap
123	if step <= 0:
124	raise ValueError(
125	f"chunk_overlap ({self.config.chunk_overlap}) must be less than "
126	f"chunk_size ({self.config.chunk_size})"
127	)
128	chunk_index = 0
129
130	for i in range(0, len(words), step):
131	chunk_words = words[i : i + self.config.chunk_size]
132
133	if len(chunk_words) < self.config.min_chunk_size:
134	continue
135
136	chunk_text = " ".join(chunk_words)
137	if self.config.max_chunk_chars:
138	chunk_text = chunk_text[: self.config.max_chunk_chars]
139
140	chunks.append(
141	{
142	"chunk_id": f"{source_id}_{chunk_index}",
143	"original_id": source_id,
144	source_column: chunk_text,
145	"chunk_index": chunk_index,
146	}
147	)

__init__ (Method) · 0.90

test_basic_chunking (Method) · 0.90

test_overlap (Method) · 0.90

test_min_chunk_size_filters_small_trailing (Method) · 0.90

test_max_chunk_chars_truncation (Method) · 0.90

test_empty_text_returns_no_chunks (Method) · 0.90

test_chunk_dataframe (Method) · 0.90

test_end_to_end_pipeline (Function) · 0.90

no outgoing calls

test_basic_chunking (Method) · 0.72

test_overlap (Method) · 0.72

test_min_chunk_size_filters_small_trailing (Method) · 0.72

test_max_chunk_chars_truncation (Method) · 0.72

test_empty_text_returns_no_chunks (Method) · 0.72

test_chunk_dataframe (Method) · 0.72

test_end_to_end_pipeline (Function) · 0.72