hub / github.com/1Panel-dev/MaxKB / sync

Method sync

apps/knowledge/serializers/document.py:606–681 · view source on GitHub ↗

(self, with_valid=True, with_embedding=True)

Source from the content-addressed store, hash-verified

604
605	@transaction.atomic
606	def sync(self, with_valid=True, with_embedding=True):
607	if with_valid:
608	self.is_valid(raise_exception=True)
609	document_id = self.data.get("document_id")
610	document = QuerySet(Document).filter(id=document_id).first()
611	state = State.SUCCESS
612	if document.type != KnowledgeType.WEB:
613	return True
614	try:
615	ListenerManagement.update_status(
616	QuerySet(Document).filter(id=document_id), TaskType.SYNC, State.PENDING
617	)
618	ListenerManagement.get_aggregation_document_status(document_id)()
619	source_url = document.meta.get("source_url")
620	selector_list = (
621	document.meta.get("selector").split(" ")
622	if "selector" in document.meta and document.meta.get("selector") is not None
623	else []
624	)
625	result = Fork(source_url, selector_list).fork()
626	if result.status == 200:
627	# 删除段落
628	QuerySet(model=Paragraph).filter(document_id=document_id).delete()
629	# 删除问题
630	QuerySet(model=ProblemParagraphMapping).filter(document_id=document_id).delete()
631	delete_problems_and_mappings([document_id])
632	# 删除向量库
633	delete_embedding_by_document(document_id)
634	paragraphs = get_split_model("web.md").parse(result.content)
635	char_length = reduce(lambda x, y: x + y, [len(p.get("content")) for p in paragraphs], 0)
636	QuerySet(Document).filter(id=document_id).update(char_length=char_length)
637	document_paragraph_model = DocumentSerializers.Create.get_paragraph_model(document, paragraphs)
638
639	paragraph_model_list = document_paragraph_model.get("paragraph_model_list")
640	problem_paragraph_object_list = document_paragraph_model.get("problem_paragraph_object_list")
641	problem_model_list, problem_paragraph_mapping_list = ProblemParagraphManage(
642	problem_paragraph_object_list, document.knowledge_id
643	).to_problem_model_list()
644	# 批量插入段落
645	if len(paragraph_model_list) > 0:
646	max_position = (
647	Paragraph.objects.filter(document_id=document_id).aggregate(max_position=Max("position"))[
648	"max_position"
649	]
650	or 0
651	)
652	for i, paragraph in enumerate(paragraph_model_list):
653	paragraph.position = max_position + i + 1
654	QuerySet(Paragraph).bulk_create(paragraph_model_list)
655	# 批量插入问题
656	QuerySet(Problem).bulk_create(problem_model_list) if len(problem_model_list) > 0 else None
657	# 插入关联问题
658	QuerySet(ProblemParagraphMapping).bulk_create(problem_paragraph_mapping_list) if len(
659	problem_paragraph_mapping_list
660	) > 0 else None
661	# 向量化
662	if with_embedding:
663	embedding_model_id = get_embedding_model_id_by_knowledge_id(document.knowledge_id)

Callers 4

batch_sync · Method · 0.45

handler · Function · 0.45

put · Method · 0.45

Calls 15

is_valid · Method · 0.95

Fork · Class · 0.90

delete_problems_and_mappings · Function · 0.90

delete_embedding_by_document · Function · 0.90

get_split_model · Function · 0.90

ProblemParagraphManage · Class · 0.90

get_embedding_model_id_by_knowledge_id · Function · 0.90

get_aggregation_document_status · Method · 0.80

get_paragraph_model · Method · 0.80

to_problem_model_list · Method · 0.80

get · Method · 0.45

update_status · Method · 0.45

Tested by

no test coverage detected