Embed and chunk the given documents, convert the result into the format expected by the FeatureView schema using a schema transform function (user-supplied, or the default one when none is given), and save the resulting DataFrame to the online store. Args: documents: DataFrame containing the documents to embed. id_column: Column nam
(
self,
documents: pd.DataFrame,
id_column: str,
source_column: str,
type_column: Optional[str] = None,
column_mapping: Optional[tuple[str, str]] = None,
custom_schema_transform_fn: Optional[
Callable[[pd.DataFrame], pd.DataFrame]
] = None,
)
| 281 | os.chdir(original_cwd) |
| 282 | |
| 283 | def embed_documents( |
| 284 | self, |
| 285 | documents: pd.DataFrame, |
| 286 | id_column: str, |
| 287 | source_column: str, |
| 288 | type_column: Optional[str] = None, |
| 289 | column_mapping: Optional[tuple[str, str]] = None, |
| 290 | custom_schema_transform_fn: Optional[ |
| 291 | Callable[[pd.DataFrame], pd.DataFrame] |
| 292 | ] = None, |
| 293 | ) -> pd.DataFrame: |
| 294 | """ |
| 295 | Embed a list of documents and chunk them into a format expected by the FeatureView schema using a Logic Implementation By the user and save the DataFrame to the online store. |
| 296 | |
| 297 | Args: |
| 298 | documents: DataFrame containing the documents to embed. |
| 299 | id_column: Column name containing the document IDs. |
| 300 | source_column: Column name containing the document sources. |
| 301 | type_column: Column name containing the document types. |
| 302 | column_mapping: Tuple mapping source columns to (modality, output column). |
| 303 | custom_schema_transform_fn: Custom schema transform function to use for transforming the output of the chunker and embedder into the format expected by the FeatureView schema. |
| 304 | Returns: |
| 305 | DataFrame with the embedded documents. |
| 306 | |
| 307 | Example: |
| 308 | documents = pd.DataFrame({ |
| 309 | "id": [1, 2, 3], |
| 310 | "source": ["source1", "source2", "source3"], |
| 311 | "type": ["type1", "type2", "type3"], |
| 312 | "text": ["text1", "text2", "text3"], |
| 313 | }) |
| 314 | column_mapping = ("text", "text_embedding") |
| 315 | df = embed_documents(documents=documents, id_column="id", source_column="source", type_column="type", column_mapping=column_mapping) |
| 316 | |
| 317 | """ |
| 318 | if custom_schema_transform_fn is not None: |
| 319 | sig = inspect.signature(custom_schema_transform_fn) |
| 320 | params = list(sig.parameters.values()) |
| 321 | if ( |
| 322 | len(params) != 1 |
| 323 | or params[0].annotation != pd.DataFrame |
| 324 | or sig.return_annotation != pd.DataFrame |
| 325 | ): |
| 326 | raise ValueError( |
| 327 | "custom_schema_transform_fn must be a function that takes a DataFrame and returns a DataFrame" |
| 328 | ) |
| 329 | current_schema_transform_fn = ( |
| 330 | custom_schema_transform_fn |
| 331 | if custom_schema_transform_fn is not None |
| 332 | else self.schema_transform_fn |
| 333 | ) |
| 334 | |
| 335 | if column_mapping is None: |
| 336 | column_mapping = ("text", "text_embedding") |
| 337 | |
| 338 | if ( |
| 339 | current_schema_transform_fn is default_schema_transform_fn |
| 340 | and column_mapping[0] == "text" |