(self, with_valid=True, with_embedding=True)
| 604 | |
| 605 | @transaction.atomic |
| 606 | def sync(self, with_valid=True, with_embedding=True): |
| 607 | if with_valid: |
| 608 | self.is_valid(raise_exception=True) |
| 609 | document_id = self.data.get("document_id") |
| 610 | document = QuerySet(Document).filter(id=document_id).first() |
| 611 | state = State.SUCCESS |
| 612 | if document.type != KnowledgeType.WEB: |
| 613 | return True |
| 614 | try: |
| 615 | ListenerManagement.update_status( |
| 616 | QuerySet(Document).filter(id=document_id), TaskType.SYNC, State.PENDING |
| 617 | ) |
| 618 | ListenerManagement.get_aggregation_document_status(document_id)() |
| 619 | source_url = document.meta.get("source_url") |
| 620 | selector_list = ( |
| 621 | document.meta.get("selector").split(" ") |
| 622 | if "selector" in document.meta and document.meta.get("selector") is not None |
| 623 | else [] |
| 624 | ) |
| 625 | result = Fork(source_url, selector_list).fork() |
| 626 | if result.status == 200: |
| 627 | # 删除段落 |
| 628 | QuerySet(model=Paragraph).filter(document_id=document_id).delete() |
| 629 | # 删除问题 |
| 630 | QuerySet(model=ProblemParagraphMapping).filter(document_id=document_id).delete() |
| 631 | delete_problems_and_mappings([document_id]) |
| 632 | # 删除向量库 |
| 633 | delete_embedding_by_document(document_id) |
| 634 | paragraphs = get_split_model("web.md").parse(result.content) |
| 635 | char_length = reduce(lambda x, y: x + y, [len(p.get("content")) for p in paragraphs], 0) |
| 636 | QuerySet(Document).filter(id=document_id).update(char_length=char_length) |
| 637 | document_paragraph_model = DocumentSerializers.Create.get_paragraph_model(document, paragraphs) |
| 638 | |
| 639 | paragraph_model_list = document_paragraph_model.get("paragraph_model_list") |
| 640 | problem_paragraph_object_list = document_paragraph_model.get("problem_paragraph_object_list") |
| 641 | problem_model_list, problem_paragraph_mapping_list = ProblemParagraphManage( |
| 642 | problem_paragraph_object_list, document.knowledge_id |
| 643 | ).to_problem_model_list() |
| 644 | # 批量插入段落 |
| 645 | if len(paragraph_model_list) > 0: |
| 646 | max_position = ( |
| 647 | Paragraph.objects.filter(document_id=document_id).aggregate(max_position=Max("position"))[ |
| 648 | "max_position" |
| 649 | ] |
| 650 | or 0 |
| 651 | ) |
| 652 | for i, paragraph in enumerate(paragraph_model_list): |
| 653 | paragraph.position = max_position + i + 1 |
| 654 | QuerySet(Paragraph).bulk_create(paragraph_model_list) |
| 655 | # 批量插入问题 |
| 656 | QuerySet(Problem).bulk_create(problem_model_list) if len(problem_model_list) > 0 else None |
| 657 | # 插入关联问题 |
| 658 | QuerySet(ProblemParagraphMapping).bulk_create(problem_paragraph_mapping_list) if len( |
| 659 | problem_paragraph_mapping_list |
| 660 | ) > 0 else None |
| 661 | # 向量化 |
| 662 | if with_embedding: |
| 663 | embedding_model_id = get_embedding_model_id_by_knowledge_id(document.knowledge_id) |
no test coverage detected