hub / github.com/TaskingAI/TaskingAI / process_content

Function process_content

backend/app/operators/retrieval/record.py:17–62 · view source on GitHub ↗

(
    collection: Collection,
    type: RecordType,
    title: str,
    text_splitter: TextSplitter,
    max_num_chunks: int,
    content: Optional[str] = None,
    file_id: Optional[str] = None,
    url: Optional[str] = None,
)

Source from the content-addressed store, hash-verified

15
16
async def process_content(
    collection: Collection,
    type: RecordType,
    title: str,
    text_splitter: TextSplitter,
    max_num_chunks: int,
    content: Optional[str] = None,
    file_id: Optional[str] = None,
    url: Optional[str] = None,
):
    """Load a record's content, split it into chunks, and embed the chunks.

    The steps run in this order: load the value returned by
    ``load_db_content``, load the text returned by ``load_content_to_split``,
    split that text with ``text_splitter``, enforce ``max_num_chunks``, look
    up the collection's embedding model, and embed every chunk.

    Args:
        collection: Supplies ``embedding_model_id`` and ``embedding_size``.
        type: Record type, forwarded to both loaders as ``record_type``.
        title: Forwarded to ``text_splitter.split_text`` with the text.
        text_splitter: Splitter whose ``split_text`` yields the chunk texts
            and a parallel list of token counts.
        max_num_chunks: Largest number of chunks accepted for this record.
        content: Optional inline content, forwarded to both loaders.
        file_id: Optional file identifier, forwarded to both loaders.
        url: Optional URL, forwarded to both loaders.

    Returns:
        A 4-tuple ``(chunk texts, token counts, embeddings, db content)``.

    Raises:
        Whatever ``raise_http_error`` raises for
        ``ErrorCode.RESOURCE_LIMIT_REACHED`` when the splitter produces more
        than ``max_num_chunks`` chunks (presumably an HTTP error -- confirm
        against ``raise_http_error``).
    """
    # Imported at call time rather than at module level
    # (presumably to avoid a circular import -- TODO confirm).
    from app.services.retrieval.embedding import embed_documents

    # Both loaders receive exactly the same description of the record source.
    source_args = {
        "record_type": type,
        "content": content,
        "file_id": file_id,
        "url": url,
    }

    # Load the value that is returned to the caller as the record's db content.
    stored_content = await load_db_content(**source_args)

    # Load the text that will actually be split into chunks.
    splittable_text = await load_content_to_split(**source_args)

    # Split into chunks and reject the record if it produces too many.
    chunks, token_counts = text_splitter.split_text(text=splittable_text, title=title)
    if len(chunks) > max_num_chunks:
        raise_http_error(
            ErrorCode.RESOURCE_LIMIT_REACHED,
            "The collection has no enough capacity to store the new chunks created from the record content.",
        )

    # Fetch the embedding model configured on the collection.
    model = await model_ops.get(model_id=collection.embedding_model_id)

    # Embed every chunk at the collection's embedding size.
    vectors = await embed_documents(
        documents=chunks,
        embedding_model=model,
        embedding_size=collection.embedding_size,
    )

    return chunks, token_counts, vectors, stored_content
63
64
65	class RecordModelOperator(PostgresModelOperator):

Callers 2

create (Method) · 0.85

update (Method) · 0.85

Calls 6

load_db_content (Function) · 0.90

load_content_to_split (Function) · 0.90

raise_http_error (Function) · 0.90

embed_documents (Function) · 0.90

split_text (Method) · 0.80

get (Method) · 0.45

Tested by

no test coverage detected