(
collection: Collection,
type: RecordType,
title: str,
text_splitter: TextSplitter,
max_num_chunks: int,
content: Optional[str] = None,
file_id: Optional[str] = None,
url: Optional[str] = None,
)
| 15 | |
| 16 | |
async def process_content(
    collection: Collection,
    type: RecordType,
    title: str,
    text_splitter: TextSplitter,
    max_num_chunks: int,
    content: Optional[str] = None,
    file_id: Optional[str] = None,
    url: Optional[str] = None,
):
    """Load a record's content, split it into chunks and embed every chunk.

    Steps, in order:

    1. ``load_db_content`` and ``load_content_to_split`` are each awaited
       with the same record type / content / file id / url arguments.
    2. ``text_splitter.split_text`` turns the text to split (together with
       ``title``) into a pair of lists: chunk texts and token counts.
    3. If more chunks were produced than ``max_num_chunks`` allows,
       ``raise_http_error`` is called with
       ``ErrorCode.RESOURCE_LIMIT_REACHED``.
    4. The embedding model referenced by ``collection.embedding_model_id``
       is fetched through ``model_ops.get``.
    5. The chunks are embedded with ``embed_documents`` using that model and
       ``collection.embedding_size``.

    Args:
        collection: Collection supplying ``embedding_model_id`` and
            ``embedding_size``.
        type: Record type, forwarded to both loaders as ``record_type``.
        title: Title handed to the text splitter alongside the text.
        text_splitter: Splitter whose ``split_text`` produces the chunks.
        max_num_chunks: Largest number of chunks accepted for this record.
        content: Optional inline content, forwarded to both loaders.
        file_id: Optional file id, forwarded to both loaders.
        url: Optional URL, forwarded to both loaders.

    Returns:
        A 4-tuple ``(chunk texts, token counts, embeddings, db content)``.
    """
    # NOTE(review): imported at call time rather than at module level,
    # presumably to avoid a circular import -- confirm before moving it.
    from app.services.retrieval.embedding import embed_documents

    # Both loaders take exactly the same description of the record source.
    source_kwargs = {
        "record_type": type,
        "content": content,
        "file_id": file_id,
        "url": url,
    }
    stored_content = await load_db_content(**source_kwargs)
    splittable_text = await load_content_to_split(**source_kwargs)

    # Split the text into chunks and enforce the chunk limit.
    chunks, token_counts = text_splitter.split_text(text=splittable_text, title=title)
    chunk_count = len(chunks)
    if chunk_count > max_num_chunks:
        raise_http_error(
            ErrorCode.RESOURCE_LIMIT_REACHED,
            "The collection has no enough capacity to store the new chunks created from the record content.",
        )

    # Look up the collection's embedding model, then embed every chunk.
    model = await model_ops.get(model_id=collection.embedding_model_id)
    vectors = await embed_documents(
        documents=chunks,
        embedding_model=model,
        embedding_size=collection.embedding_size,
    )

    return chunks, token_counts, vectors, stored_content
| 63 | |
| 64 | |
| 65 | class RecordModelOperator(PostgresModelOperator): |
no test coverage detected