| 141 | return blocks |
| 142 | |
def _merge(self, documents, chunk_token_threshold, overlap):
    """Merge documents into space-joined sections of roughly equal size.

    Each document is split on single spaces and its size is estimated as
    ``word_count * self.word_token_rate``. The number of sections aimed
    for is ``floor(total_tokens / chunk_token_threshold)`` (never fewer
    than one), and the per-section budget is the total spread evenly over
    that many sections. Documents are never split: a document either joins
    the current section or starts the next one.

    Args:
        documents: Iterable of strings to merge, in order.
        chunk_token_threshold: Target estimated-token size of a section.
            Zero or a negative value yields a single section.
        overlap: Number of trailing words of a finished section that are
            repeated at the start of the following section. ``0`` (or
            less) disables overlap.

    Returns:
        List of section strings; empty when ``documents`` is empty.
    """
    rate = self.word_token_rate

    # Tokenise each document once, keeping its estimated token count.
    tokenized = []
    total_tokens = 0
    for document in documents:
        words = document.split(' ')
        token_count = len(words) * rate
        tokenized.append((words, token_count))
        total_tokens += token_count

    # A non-positive threshold cannot size sections (and zero would divide
    # by zero), so everything goes into one section.
    if chunk_token_threshold > 0:
        num_sections = max(1, math.floor(total_tokens / chunk_token_threshold))
    else:
        num_sections = 1
    adjusted_chunk_threshold = total_tokens / num_sections

    sections = []
    current_chunk = []
    total_token_so_far = 0

    for words, token_count in tokenized:
        if total_token_so_far + token_count <= adjusted_chunk_threshold:
            current_chunk.extend(words)
            total_token_so_far += token_count
            continue

        # The final section absorbs whatever is left so the section count
        # never exceeds num_sections.
        if len(sections) == num_sections - 1:
            current_chunk.extend(words)
            continue

        if not current_chunk:
            # The very first document is already over budget: there is
            # nothing to close yet, so do not emit an empty section.
            current_chunk = list(words)
            total_token_so_far = token_count
            continue

        sections.append(' '.join(current_chunk))

        # Carry the tail of the finished section into the next one so that
        # adjacent sections share context. The carried words are not
        # counted against the next section's budget.
        carried = current_chunk[-overlap:] if overlap > 0 else []
        current_chunk = carried + words
        total_token_so_far = token_count

    # Flush the section still being built.
    if current_chunk:
        sections.append(' '.join(current_chunk))

    return sections
| 188 | |
| 189 | |
| 190 | def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]: |