MCPcopy
hub / github.com/VectifyAI/PageIndex / md_to_tree

Function md_to_tree

pageindex/page_index_md.py:243–300  ·  view source on GitHub ↗
(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes')

Source from the content-addressed store, hash-verified

241
242
async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'):
    """Read a markdown file and turn it into a tree-structure dictionary.

    The file is read as UTF-8, split into nodes by
    ``extract_nodes_from_markdown``, enriched with text by
    ``extract_node_text_content``, optionally thinned, and assembled by
    ``build_tree_from_nodes``. Progress messages are printed at each stage.

    Args:
        md_path: Path of the markdown file to read.
        if_thinning: When truthy, token counts are attached to the nodes and
            ``tree_thinning_for_index`` is applied before the tree is built.
        min_token_threshold: Passed to ``tree_thinning_for_index``; only used
            when ``if_thinning`` is truthy.
        if_add_node_summary: ``'yes'`` runs
            ``generate_summaries_for_structure_md`` on the tree.
        summary_token_threshold: Forwarded to the summary generator.
        model: Forwarded to the token-count, thinning, summary and
            description helpers.
        if_add_doc_description: ``'yes'`` adds a ``'doc_description'`` entry
            to the result. Only honoured when ``if_add_node_summary`` is
            ``'yes'``.
        if_add_node_text: Controls whether ``'text'`` is among the fields
            requested from ``format_structure``. With summaries enabled the
            text is dropped only when this is exactly ``'no'``; without
            summaries it is kept only when this is exactly ``'yes'``.
        if_add_node_id: ``'yes'`` calls ``write_node_id`` on the built tree.

    Returns:
        A dict with ``'doc_name'`` (file name without extension),
        ``'line_count'`` (number of newline characters plus one) and
        ``'structure'``; ``'doc_description'`` is inserted after
        ``'doc_name'`` when a description was generated.
    """
    # Field orders handed to format_structure; they differ only by 'text'.
    fields_without_text = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes']
    fields_with_text = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes']

    with open(md_path, encoding='utf-8') as source:
        markdown_content = source.read()
    line_count = markdown_content.count('\n') + 1

    print("Extracting nodes from markdown...")
    node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)

    print("Extracting text content from nodes...")
    nodes = extract_node_text_content(node_list, markdown_lines)

    if if_thinning:
        nodes = update_node_list_with_text_token_count(nodes, model=model)
        print("Thinning nodes...")
        nodes = tree_thinning_for_index(nodes, min_token_threshold, model=model)

    print("Building tree from nodes...")
    tree = build_tree_from_nodes(nodes)

    if if_add_node_id == 'yes':
        write_node_id(tree)

    print("Formatting tree structure...")

    has_description = False
    doc_description = None

    if if_add_node_summary != 'yes':
        # No summaries requested: a single formatting pass, text only on 'yes'.
        chosen_fields = fields_with_text if if_add_node_text == 'yes' else fields_without_text
        tree = format_structure(tree, order=chosen_fields)
    else:
        # Text is kept for this pass because it is requested alongside the
        # summary fields before the summary generator runs.
        tree = format_structure(tree, order=fields_with_text)

        print("Generating summaries for each node...")
        tree = await generate_summaries_for_structure_md(
            tree,
            summary_token_threshold=summary_token_threshold,
            model=model,
        )

        if if_add_node_text == 'no':
            # Text was not requested, so strip it now that summaries exist.
            tree = format_structure(tree, order=fields_without_text)

        if if_add_doc_description == 'yes':
            print("Generating document description...")
            # The description is generated from a reduced copy of the tree.
            clean_structure = create_clean_structure_for_description(tree)
            doc_description = generate_doc_description(clean_structure, model=model)
            has_description = True

    # Assemble the result in one place, keeping the key order
    # doc_name, [doc_description], line_count, structure.
    result = {'doc_name': os.path.splitext(os.path.basename(md_path))[0]}
    if has_description:
        result['doc_description'] = doc_description
    result['line_count'] = line_count
    result['structure'] = tree
    return result

Callers 3

run_pageindex.py File · 0.90
index Method · 0.85
page_index_md.py File · 0.85

Tested by

no test coverage detected