解析文本 :param text: 文本数据 :return: 解析后数据 {content:段落数据,keywords:['段落关键词'],parent_chain:['段落父级链路']}
(self, text: str)
| 410 | return level_content_list |
| 411 | |
def parse(self, text: str):
    """
    Parse raw text into a list of paragraph records.

    Line endings are normalised to LF and NUL characters are removed before
    the text is turned into a tree by ``self.parse_to_tree`` and flattened
    by ``result_tree_to_paragraph``. Every flattened row is then passed
    through ``self.post_reset_paragraph``.

    :param text: raw text data
    :return: list of parsed paragraphs, each of the form
             {content: paragraph text, keywords: ['paragraph keywords'],
             parent_chain: ['chain of parent titles']}; entries whose
             content is missing or blank after stripping are left out
    """
    # CRLF must be replaced before the lone-CR pass, otherwise each CRLF
    # would become two line feeds.
    text = text.replace('\r\n', '\n').replace('\r', '\n').replace('\0', '')
    result_tree = self.parse_to_tree(text, 0)
    paragraphs = result_tree_to_paragraph(result_tree, [], [], self.with_filter)
    # De-duplicated titles, shared by every post-processing call below.
    title_list = list({row.get('title') for row in paragraphs})
    processed = [self.post_reset_paragraph(row, title_list) for row in paragraphs]
    # Keep only paragraphs that still carry non-blank content.
    return [item for item in processed
            if 'content' in item and item.get('content').strip()]
| 429 | |
| 430 | def post_reset_paragraph(self, paragraph: Dict, title_list: List[str]): |
| 431 | result = self.content_is_null(paragraph, title_list) |
no test coverage detected