Query ``DocumentStore`` for the list of input documents.
(
self, input_queries: pw.Table[InputsQuerySchema]
)
| 451 | |
| 452 | @pw.table_transformer |
| 453 | def inputs_query( |
| 454 | self, input_queries: pw.Table[InputsQuerySchema] |
| 455 | ) -> pw.Table[InputsResultSchema]: |
| 456 | """ |
| 457 | Query ``DocumentStore`` for the list of input documents. |
| 458 | """ |
| 459 | # TODO: compare this approach to first joining queries to documents, then filtering, |
| 460 | # then grouping to get each response. |
| 461 | # The "dumb" tuple approach has more work precomputed for an all inputs query |
| 462 | all_metas = self.progress_table.reduce( |
| 463 | metadatas=pw.reducers.tuple(pw.this.metadata), |
| 464 | is_parsed=pw.reducers.tuple(pw.this.is_parsed), |
| 465 | ) |
| 466 | |
| 467 | input_queries = self.merge_filters(input_queries) |
| 468 | |
| 469 | @pw.udf |
| 470 | def format_inputs( |
| 471 | metadatas: list[pw.Json] | None, |
| 472 | metadata_filter: str | None, |
| 473 | return_status: bool, |
| 474 | is_parsed: list[bool] | None, |
| 475 | ) -> list[pw.Json]: |
| 476 | metadatas = metadatas if metadatas is not None else [] |
| 477 | is_parsed = is_parsed if is_parsed is not None else [] |
| 478 | assert metadatas is not None |
| 479 | assert is_parsed is not None |
| 480 | |
| 481 | def remove_id(m): |
| 482 | metadata_dict = m.as_dict() |
| 483 | del metadata_dict["_file_id"] |
| 484 | return pw.Json(metadata_dict) |
| 485 | |
| 486 | metadatas = [remove_id(m) for m in metadatas] |
| 487 | if metadata_filter: |
| 488 | metadatas = [ |
| 489 | m |
| 490 | for m in metadatas |
| 491 | if jmespath.search( |
| 492 | metadata_filter, m.as_dict(), options=_knn_lsh._glob_options |
| 493 | ) |
| 494 | ] |
| 495 | |
| 496 | if return_status: |
| 497 | metadatas = [ |
| 498 | pw.Json( |
| 499 | { |
| 500 | "_indexing_status": ( |
| 501 | IndexingStatus.INDEXED |
| 502 | if status |
| 503 | else IndexingStatus.INGESTED |
| 504 | ), |
| 505 | **m.as_dict(), |
| 506 | } |
| 507 | ) |
| 508 | for (m, status) in zip(metadatas, is_parsed) |
| 509 | ] |
| 510 |