Function split_markdown

scripts/bot/split_markdown.py:112–198 · view source on GitHub ↗

拆分 Markdown 文档为段落

(markdown_text: str, file_name: str)

Source from the content-addressed store, hash-verified

110
111
112	def split_markdown(markdown_text: str, file_name: str) -> list[MarkdownBlock]:
113	"""
114	拆分 Markdown 文档为段落
115	"""
116	markdown_text = markdown_text.replace("\r\n", "\n").replace("\r", "\n")
117
118	# 文档标题
119	title = ""
120
121	lines = markdown_text.split("\n")
122	# markdown 段落
123	blocks: list[MarkdownBlock] = []
124
125	# 当前二级标题
126	current_header = None
127	current_content: list[BlockContent] = []
128	# 代码需要合并到一起，所以先收集
129	current_code: list[str] = []
130	# 是否在代码块中
131	in_code_block = False
132
133	# 文档元数据
134	in_meta = False
135
136	for line in lines:
137	# 处理文档元数据
138	if line.startswith("---"):
139	in_meta = not in_meta
140	continue
141
142	if in_meta and ":" in line:
143	key, value = line.split(":")
144	if key == "title":
145	title = value.strip()
146	continue
147
148	# 这是版本说明，没什么用
149	if line.startswith("> ") and "以上版本" in line:
150	continue
151
152	if line.startswith(">"):
153	line = line.replace(">", "")
154
155	if line.strip() == "":
156	continue
157
158	header_match = re.match(r"^#+\s", line)
159	# 匹配到了标题
160	if header_match:
161	# 如果之前有标题，那么这就是新的一段
162	if current_header is not None:
163	# 至少要有内容或者代码块
164	if len(current_content) > 0:
165	blocks.append(MarkdownBlock(file_name, title,
166	current_header, current_content))
167	current_content = []
168	current_code = []
169	# 开启新段落解析

gen_embedding.pyFile · 0.90

testFunction · 0.85

MarkdownBlockClass · 0.85

BlockContentClass · 0.85

replaceMethod · 0.80

matchMethod · 0.65

no test coverage detected