获取用于生成嵌入的文本段落列表
(self)
| 76 | return output |
| 77 | |
def get_text_blocks(self) -> list[str]:
    """Return the text passages used to generate embeddings.

    Candidate passages are collected in this order:

    1. The header with every ``#`` removed; when it is shorter than four
       characters it is prefixed with the title.
    2. For each text paragraph: ``title + header + text``, then the
       paragraph text alone, and, when the paragraph is longer than
       ``LONG_CONTENT_LENGTH``, each of its comma-separated pieces.
    3. ``header + all paragraph text`` when the combined paragraph text
       is shorter than ``LONG_CONTENT_LENGTH``.

    Returns:
        The unique, whitespace-stripped, non-empty passages in first-seen
        order, so the result is the same on every run.
    """
    header = self.header.replace("#", "") if self.header else ""

    blocks: list[str] = []
    if header:
        # A very short header is combined with the title
        # (presumably because it carries too little meaning alone).
        blocks.append(self.title + header if len(header) < 4 else header)

    texts: list[str] = []
    for para in self.content:
        if para.type != ContentType.Text:
            continue
        # Per the original comment, unmark strips styling and images so
        # they do not affect the result.
        text = unmark(para.text)
        texts.append(text)
        blocks.append(self.title + header + text)
        blocks.append(text)
        # Overly long paragraphs are additionally split into pieces.
        if len(text) > LONG_CONTENT_LENGTH:
            blocks.extend(text.split(","))

    all_text = "".join(texts)
    if len(all_text) < LONG_CONTENT_LENGTH:
        blocks.append(header + all_text)

    # Drop empty strings and duplicates. dict.fromkeys keeps insertion
    # order, unlike a set, whose iteration order for strings varies
    # between interpreter runs.
    stripped = (block.strip() for block in blocks)
    return list(dict.fromkeys(block for block in stripped if block))
| 110 | |
| 111 | |
| 112 | def split_markdown(markdown_text: str, file_name: str) -> list[MarkdownBlock]: |
no test coverage detected