MCPcopy Index your code
hub / github.com/1Panel-dev/MaxKB / SplitModel

Class SplitModel

apps/common/utils/split_model.py:367–462  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

365
366
367class SplitModel:
368
369 def __init__(self, content_level_pattern, with_filter=True, limit=100000):
370 self.content_level_pattern = content_level_pattern
371 self.with_filter = with_filter
372 if type(limit) is not int:
373 limit = int(limit)
374 if limit is None or limit > 100000:
375 limit = 100000
376 if limit < 50:
377 limit = 50
378 self.limit = limit
379
def parse_to_tree(self, text: str, index=0):
    """
    Parse text into a tree-shaped list of nodes.

    :param text: the text to parse
    :param index: which pattern (regex) to start parsing from; the method
        recurses with ``index + 1`` for the content below each title
    :return: the parsed tree-shaped result data (a list of node objects as
        produced by ``parse_title_level`` / ``to_tree_obj``)
    :raises ValueError: if a node's content cannot be located in ``text``
        (raised by ``str.index``)
    """
    level_content_list = parse_title_level(text, self.content_level_pattern, index)
    if len(level_content_list) == 0:
        # Nothing matched at this level: the whole text becomes 'block' nodes.
        return [to_tree_obj(row, 'block') for row in smart_split_paragraph(text, limit=self.limit)]
    if index == 0 and text.lstrip().index(level_content_list[0]["content"].lstrip()) != 0:
        # Top level only: the text does not start with the first parsed item,
        # so an empty node is placed in front of it.
        level_content_list.insert(0, to_tree_obj(""))

    cursor = 0
    level_title_content_list = [item for item in level_content_list if item.get('state') == 'title']
    for i, title_node in enumerate(level_title_content_list):
        start_content: str = title_node.get('content')
        # Locate the title once; the position is needed for both the
        # comparison and the slice below.
        title_start = text.index(start_content, cursor)
        if cursor < title_start:
            # Text between the cursor and this title becomes 'block' nodes.
            # NOTE(review): each row is inserted at position 0, so several
            # rows end up in reverse order at the front of the list --
            # confirm that this ordering is intended.
            for row in smart_split_paragraph(text[cursor:title_start], limit=self.limit):
                level_content_list.insert(0, to_tree_obj(row, 'block'))

        block, cursor = get_level_block(text, level_title_content_list, i, cursor)
        if len(block) == 0:
            continue
        children = self.parse_to_tree(text=block, index=index + 1)
        title_node['children'] = children
        if not children:
            # Nothing was produced for this block; there is no first child
            # to measure against (children[0] would raise IndexError).
            continue
        stripped_block = block.lstrip()
        first_child_idx_in_block = stripped_block.index(children[0]["content"].lstrip())
        if first_child_idx_in_block != 0:
            # The index above is relative to the left-stripped block, so add
            # the stripped whitespace length back before slicing the
            # unstripped block; otherwise the prefix is cut short.
            leading_ws = len(block) - len(stripped_block)
            inner_children = self.parse_to_tree(block[:leading_ws + first_child_idx_in_block], index + 1)
            title_node['children'].extend(inner_children)
    return level_content_list
411
412 def parse(self, text: str):
413 """
414 解析文本
415 :param text: 文本数据
416 :return: 解析后数据 {content:段落数据,keywords:[‘段落关键词’],parent_chain:['段落父级链路']}
417 """
418 text = text.replace('\r\n', '\n')
419 text = text.replace('\r', '\n')
420 text = text.replace("\0", '')
421 result_tree = self.parse_to_tree(text, 0)
422 result = result_tree_to_paragraph(result_tree, [], [], self.with_filter)
423 for e in result:
424 if len(e['content']) > 4096:

Callers 6

handleMethod · 0.90
handle_linksMethod · 0.90
handleMethod · 0.90
handleMethod · 0.90
handleMethod · 0.90
get_split_modelFunction · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected