MCPcopy Index your code
hub / github.com/1Panel-dev/MaxKB / sync

Method sync

apps/knowledge/serializers/document.py:606–681  ·  view source on GitHub ↗
(self, with_valid=True, with_embedding=True)

Source from the content-addressed store, hash-verified

604
605 @transaction.atomic
606 def sync(self, with_valid=True, with_embedding=True):
607 if with_valid:
608 self.is_valid(raise_exception=True)
609 document_id = self.data.get("document_id")
610 document = QuerySet(Document).filter(id=document_id).first()
611 state = State.SUCCESS
612 if document.type != KnowledgeType.WEB:
613 return True
614 try:
615 ListenerManagement.update_status(
616 QuerySet(Document).filter(id=document_id), TaskType.SYNC, State.PENDING
617 )
618 ListenerManagement.get_aggregation_document_status(document_id)()
619 source_url = document.meta.get("source_url")
620 selector_list = (
621 document.meta.get("selector").split(" ")
622 if "selector" in document.meta and document.meta.get("selector") is not None
623 else []
624 )
625 result = Fork(source_url, selector_list).fork()
626 if result.status == 200:
627 # 删除段落
628 QuerySet(model=Paragraph).filter(document_id=document_id).delete()
629 # 删除问题
630 QuerySet(model=ProblemParagraphMapping).filter(document_id=document_id).delete()
631 delete_problems_and_mappings([document_id])
632 # 删除向量库
633 delete_embedding_by_document(document_id)
634 paragraphs = get_split_model("web.md").parse(result.content)
635 char_length = reduce(lambda x, y: x + y, [len(p.get("content")) for p in paragraphs], 0)
636 QuerySet(Document).filter(id=document_id).update(char_length=char_length)
637 document_paragraph_model = DocumentSerializers.Create.get_paragraph_model(document, paragraphs)
638
639 paragraph_model_list = document_paragraph_model.get("paragraph_model_list")
640 problem_paragraph_object_list = document_paragraph_model.get("problem_paragraph_object_list")
641 problem_model_list, problem_paragraph_mapping_list = ProblemParagraphManage(
642 problem_paragraph_object_list, document.knowledge_id
643 ).to_problem_model_list()
644 # 批量插入段落
645 if len(paragraph_model_list) > 0:
646 max_position = (
647 Paragraph.objects.filter(document_id=document_id).aggregate(max_position=Max("position"))[
648 "max_position"
649 ]
650 or 0
651 )
652 for i, paragraph in enumerate(paragraph_model_list):
653 paragraph.position = max_position + i + 1
654 QuerySet(Paragraph).bulk_create(paragraph_model_list)
655 # 批量插入问题
656 QuerySet(Problem).bulk_create(problem_model_list) if len(problem_model_list) > 0 else None
657 # 插入关联问题
658 QuerySet(ProblemParagraphMapping).bulk_create(problem_paragraph_mapping_list) if len(
659 problem_paragraph_mapping_list
660 ) > 0 else None
661 # 向量化
662 if with_embedding:
663 embedding_model_id = get_embedding_model_id_by_knowledge_id(document.knowledge_id)

Callers 4

batch_sync · Method · 0.45
handler · Function · 0.45
put · Method · 0.45
put · Method · 0.45

Calls 15

is_valid · Method · 0.95
Fork · Class · 0.90
get_split_model · Function · 0.90
get_paragraph_model · Method · 0.80
to_problem_model_list · Method · 0.80
get · Method · 0.45
update_status · Method · 0.45

Tested by

no test coverage detected