Chunk all documents in a DataFrame. Args: df: The DataFrame containing the documents to chunk. id_column: The column containing the document IDs. source_column: The column containing the document sources. type_column: The column containing the document types.
(
self,
df: pd.DataFrame,
id_column: str,
source_column: str,
type_column: Optional[str] = None,
)
| 54 | pass |
| 55 | |
def chunk_dataframe(
    self,
    df: pd.DataFrame,
    id_column: str,
    source_column: str,
    type_column: Optional[str] = None,
) -> pd.DataFrame:
    """
    Chunk all documents in a DataFrame.

    Each row is passed to ``self.load_parse_and_chunk`` and the chunks
    returned for every row are concatenated, in row order, into a single
    DataFrame.

    Args:
        df: The DataFrame containing the documents to chunk.
        id_column: The column containing the document IDs. Each ID is
            converted to ``str`` before being handed to the chunker.
        source_column: The column containing the document sources.
        type_column: The column containing the document types. When it is
            ``None`` (or empty), ``None`` is passed as the type for every
            row.

    Returns:
        A DataFrame built from all chunks produced by
        ``self.load_parse_and_chunk``. When no chunks are produced, an empty
        DataFrame with the columns ``chunk_id``, ``original_id``,
        ``source_column`` and ``chunk_index`` is returned.

    Raises:
        KeyError: If ``id_column``, ``source_column`` or a given
            ``type_column`` is not a column of ``df``.
    """
    # Read the columns by label rather than through ``itertuples``
    # namedtuples: pandas renames fields whose labels are not valid Python
    # identifiers (spaces, leading underscores, keywords) to positional
    # names, so ``getattr(row, column_name)`` fails for such columns.
    sources = df[source_column]
    doc_ids = df[id_column]
    # Without a type column every document gets ``None`` as its type.
    doc_types = df[type_column] if type_column else [None] * len(df)

    all_chunks = []
    for source, doc_id, doc_type in zip(sources, doc_ids, doc_types):
        all_chunks.extend(
            self.load_parse_and_chunk(
                source,
                str(doc_id),
                source_column,
                doc_type,
            )
        )

    if not all_chunks:
        # Keep a predictable schema even when there is nothing to return.
        return pd.DataFrame(
            columns=["chunk_id", "original_id", source_column, "chunk_index"]
        )
    return pd.DataFrame(all_chunks)
| 88 | |
| 89 | |
| 90 | class TextChunker(BaseChunker): |