MCPcopy Index your code
hub / github.com/baidu/amis / get_text_blocks

Method get_text_blocks

scripts/bot/split_markdown.py:78–109  ·  view source on GitHub ↗

获取用于生成嵌入的文本段落列表

(self)

Source from the content-addressed store, hash-verified

76 return output
77
def get_text_blocks(self) -> list[str]:
    """Return the list of text passages used to generate embeddings.

    Candidates are collected in this order:

    * the header with every ``#`` removed (prefixed with the title when the
      cleaned header is shorter than 4 characters);
    * for each ``ContentType.Text`` paragraph, the ``unmark``-ed text with the
      ``title + header`` prefix and on its own, plus the pieces obtained by
      splitting it on ``","`` when it is longer than ``LONG_CONTENT_LENGTH``;
    * the header followed by all paragraph text, when that combined text is
      shorter than ``LONG_CONTENT_LENGTH``.

    Returns:
        The stripped, non-empty, de-duplicated candidates in first-seen
        order, so the result is the same on every run.
    """
    header = self.header.replace("#", "") if self.header else ""
    blocks: list[str] = []
    if header:
        # NOTE(review): a header under 4 characters is presumably too short
        # to be meaningful alone, hence the title prefix - confirm intent.
        blocks.append((self.title + header) if len(header) < 4 else header)

    texts: list[str] = []
    for para in self.content:
        if para.type == ContentType.Text:
            # Strip styling and images so they do not affect the result.
            text = unmark(para.text)
            texts.append(text)
            blocks.append(self.title + header + text)
            blocks.append(text)
            # Paragraphs that are too long are additionally split up.
            if len(text) > LONG_CONTENT_LENGTH:
                blocks.extend(text.split(","))

    all_text = "".join(texts)
    if len(all_text) < LONG_CONTENT_LENGTH:
        blocks.append(header + all_text)

    # Drop duplicates and empty strings. dict keys keep insertion order,
    # whereas iterating a set of strings yields an order that changes with
    # the per-process hash seed.
    stripped = (block.strip() for block in blocks)
    return list(dict.fromkeys(block for block in stripped if block))
110
111
112def split_markdown(markdown_text: str, file_name: str) -> list[MarkdownBlock]:

Callers 1

gen_embedding.py File · 0.80

Calls 4

unmark Function · 0.85
set Function · 0.85
replace Method · 0.80
add Method · 0.45

Tested by

no test coverage detected