| 174 | |
| 175 | # Configuration for chunking |
class MarkdownChunkConfig(BaseModel):
    """Settings that control how Markdown text is split into chunks."""

    chunk_size: int = 200  # desired chunk size in tokens
    overlap_tokens: int = 30  # number of tokens to overlap between chunks
    # allowed variation; presumably relative to chunk_size -- TODO confirm
    variation_percent: float = 0.3
    rollup: bool = True  # whether to roll up chunks
    header_context_sep: str = HEADER_CONTEXT_SEP  # separator for header context

    @field_validator("chunk_size", mode="before")
    @classmethod
    def convert_chunk_size_to_int(cls, v: Any) -> int:
        """Coerce the raw ``chunk_size`` input to ``int``.

        Runs in ``mode="before"``, i.e. ahead of the field's own ``int``
        validation, to maintain backward compatibility with Pydantic V1.

        Args:
            v: The raw value supplied for ``chunk_size``.

        Returns:
            ``int(v)``. Floats are truncated toward zero (``200.9`` -> ``200``).

        Raises:
            ValueError: If ``int()`` cannot parse ``v`` (e.g. ``"abc"``).
            TypeError: If ``int()`` does not accept the type of ``v``
                (e.g. ``None``).
        """
        # int() already handles floats, so no float-specific branch is needed.
        return int(v)
| 192 | |
| 193 | |
| 194 | # A simple tokenizer that counts tokens as whitespace-separated words. |
no outgoing calls
searching dependent graphs…