| 365 | |
| 366 | |
| 367 | class SplitModel: |
| 368 | |
def __init__(self, content_level_pattern, with_filter=True, limit=100000):
    """
    Store the splitter configuration and normalise the block size limit.

    :param content_level_pattern: stored unchanged; ``parse_to_tree`` passes
        it to ``parse_title_level``
    :param with_filter: stored unchanged; ``parse`` forwards it to
        ``result_tree_to_paragraph``
    :param limit: value later passed as ``limit=`` to
        ``smart_split_paragraph``. ``None`` selects the maximum; any other
        value is coerced with ``int()`` and clamped to ``[50, 100000]``
    :raises ValueError: if ``limit`` cannot be converted by ``int()``
    :raises TypeError: if ``limit`` is of a type ``int()`` does not accept
    """
    max_limit = 100000
    min_limit = 50

    self.content_level_pattern = content_level_pattern
    self.with_filter = with_filter

    # None must be handled before int(): int(None) raises TypeError, which
    # previously made the "limit is None" fallback unreachable.
    limit = max_limit if limit is None else int(limit)

    # Clamp into the supported range.
    self.limit = max(min_limit, min(limit, max_limit))
| 379 | |
def parse_to_tree(self, text: str, index=0):
    """
    Parse text into a tree of nodes.

    :param text: the text to parse
    :param index: which level pattern (regex) to start parsing from
    :return: the parsed, tree-shaped result data
    """
    nodes = parse_title_level(text, self.content_level_pattern, index)
    if not nodes:
        # Nothing was found at this level: split the text into plain blocks.
        return [to_tree_obj(piece, 'block') for piece in smart_split_paragraph(text, limit=self.limit)]

    if index == 0:
        # At the top level, when the text does not begin with the first
        # parsed item, put an empty node in front of the list.
        first_content = nodes[0]["content"].lstrip()
        if text.lstrip().index(first_content) != 0:
            nodes.insert(0, to_tree_obj(""))

    cursor = 0
    title_nodes = [node for node in nodes if node.get('state') == 'title']
    for position, title_node in enumerate(title_nodes):
        title_text: str = title_node.get('content')
        title_start = text.index(title_text, cursor)
        if cursor < title_start:
            # Text between the cursor and this title becomes 'block' nodes.
            # Each one is inserted at the front of the result list.
            for piece in smart_split_paragraph(text[cursor:title_start], limit=self.limit):
                nodes.insert(0, to_tree_obj(piece, 'block'))

        section, cursor = get_level_block(text, title_nodes, position, cursor)
        if not section:
            continue

        # Recurse into the section with the next level pattern.
        child_nodes = self.parse_to_tree(text=section, index=index + 1)
        title_node['children'] = child_nodes

        # NOTE(review): the offset is measured on the left-stripped section
        # but is used below to slice the unstripped one - confirm intended.
        lead_length = section.lstrip().index(child_nodes[0]["content"].lstrip())
        if lead_length != 0:
            leading_nodes = self.parse_to_tree(section[:lead_length], index + 1)
            title_node['children'].extend(leading_nodes)
    return nodes
| 411 | |
| 412 | def parse(self, text: str): |
| 413 | """ |
| 414 | 解析文本 |
| 415 | :param text: 文本数据 |
| 416 | :return: 解析后数据 {content:段落数据,keywords:[‘段落关键词’],parent_chain:['段落父级链路']} |
| 417 | """ |
| 418 | text = text.replace('\r\n', '\n') |
| 419 | text = text.replace('\r', '\n') |
| 420 | text = text.replace("\0", '') |
| 421 | result_tree = self.parse_to_tree(text, 0) |
| 422 | result = result_tree_to_paragraph(result_tree, [], [], self.with_filter) |
| 423 | for e in result: |
| 424 | if len(e['content']) > 4096: |
no outgoing calls
no test coverage detected