拆分 Markdown 文档为段落
(markdown_text: str, file_name: str)
| 110 | |
| 111 | |
| 112 | def split_markdown(markdown_text: str, file_name: str) -> list[MarkdownBlock]: |
| 113 | """ |
| 114 | 拆分 Markdown 文档为段落 |
| 115 | """ |
| 116 | markdown_text = markdown_text.replace("\r\n", "\n").replace("\r", "\n") |
| 117 | |
| 118 | # 文档标题 |
| 119 | title = "" |
| 120 | |
| 121 | lines = markdown_text.split("\n") |
| 122 | # markdown 段落 |
| 123 | blocks: list[MarkdownBlock] = [] |
| 124 | |
| 125 | # 当前二级标题 |
| 126 | current_header = None |
| 127 | current_content: list[BlockContent] = [] |
| 128 | # 代码需要合并到一起,所以先收集 |
| 129 | current_code: list[str] = [] |
| 130 | # 是否在代码块中 |
| 131 | in_code_block = False |
| 132 | |
| 133 | # 文档元数据 |
| 134 | in_meta = False |
| 135 | |
| 136 | for line in lines: |
| 137 | # 处理文档元数据 |
| 138 | if line.startswith("---"): |
| 139 | in_meta = not in_meta |
| 140 | continue |
| 141 | |
| 142 | if in_meta and ":" in line: |
| 143 | key, value = line.split(":") |
| 144 | if key == "title": |
| 145 | title = value.strip() |
| 146 | continue |
| 147 | |
| 148 | # 这是版本说明,没什么用 |
| 149 | if line.startswith("> ") and "以上版本" in line: |
| 150 | continue |
| 151 | |
| 152 | if line.startswith(">"): |
| 153 | line = line.replace(">", "") |
| 154 | |
| 155 | if line.strip() == "": |
| 156 | continue |
| 157 | |
| 158 | header_match = re.match(r"^#+\s", line) |
| 159 | # 匹配到了标题 |
| 160 | if header_match: |
| 161 | # 如果之前有标题,那么这就是新的一段 |
| 162 | if current_header is not None: |
| 163 | # 至少要有内容或者代码块 |
| 164 | if len(current_content) > 0: |
| 165 | blocks.append(MarkdownBlock(file_name, title, |
| 166 | current_header, current_content)) |
| 167 | current_content = [] |
| 168 | current_code = [] |
| 169 | # 开启新段落解析 |
no test coverage detected