hub / github.com/VectifyAI/PageIndex / md_to_tree

Function md_to_tree

pageindex/page_index_md.py:243–300 · view source on GitHub ↗

(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes')

Source from the content-addressed store, hash-verified

241
242
async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'):
    """Build a tree structure from the markdown file at ``md_path``.

    The file is read as UTF-8, split into nodes, optionally thinned,
    assembled into a tree, and then formatted. Progress messages are
    printed to stdout at each stage.

    Args:
        md_path: Path of the markdown file to read.
        if_thinning: When truthy, token counts are attached to the nodes
            and the node list is passed through ``tree_thinning_for_index``.
        min_token_threshold: Forwarded to ``tree_thinning_for_index``.
        if_add_node_summary: ``'yes'`` to generate per-node summaries.
        summary_token_threshold: Forwarded to
            ``generate_summaries_for_structure_md``.
        model: Forwarded to the token-counting, thinning, summary and
            description helpers.
        if_add_doc_description: ``'yes'`` to add a document description.
            Only honoured when ``if_add_node_summary`` is also ``'yes'``.
        if_add_node_text: Controls whether ``'text'`` is kept in the output.
            With summaries enabled, text is dropped only when this is exactly
            ``'no'``; without summaries, text is kept only when this is
            exactly ``'yes'``.
        if_add_node_id: ``'yes'`` to call ``write_node_id`` on the tree.

    Returns:
        A dict with ``'doc_name'`` (file name of ``md_path`` without its
        extension), ``'line_count'`` (newline count of the file plus one)
        and ``'structure'``; ``'doc_description'`` is inserted after
        ``'doc_name'`` when a description was generated.
    """

    def _field_order(include_text):
        # Builds a fresh list on every call, like the inline literals it replaces.
        leading = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary']
        trailing = ['text', 'nodes'] if include_text else ['nodes']
        return leading + trailing

    with open(md_path, 'r', encoding='utf-8') as md_file:
        markdown_content = md_file.read()
    line_count = 1 + markdown_content.count('\n')

    print(f"Extracting nodes from markdown...")
    node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)

    print(f"Extracting text content from nodes...")
    nodes = extract_node_text_content(node_list, markdown_lines)

    if if_thinning:
        # Token counts are attached first, then passed to the thinning step.
        nodes = update_node_list_with_text_token_count(nodes, model=model)
        print(f"Thinning nodes...")
        nodes = tree_thinning_for_index(nodes, min_token_threshold, model=model)

    print(f"Building tree from nodes...")
    tree = build_tree_from_nodes(nodes)

    if if_add_node_id == 'yes':
        write_node_id(tree)

    print(f"Formatting tree structure...")

    # Holds the optional 'doc_description' entry; stays empty when no
    # description is generated so the key is absent from the result.
    description_entry = {}

    if if_add_node_summary == 'yes':
        # Text is kept during summary generation regardless of if_add_node_text.
        tree = format_structure(tree, order=_field_order(True))

        print(f"Generating summaries for each node...")
        tree = await generate_summaries_for_structure_md(
            tree,
            summary_token_threshold=summary_token_threshold,
            model=model,
        )

        if if_add_node_text == 'no':
            # Strip the text again now that the summaries exist.
            tree = format_structure(tree, order=_field_order(False))

        if if_add_doc_description == 'yes':
            print(f"Generating document description...")
            # The description is generated from a cleaned copy of the structure.
            clean_structure = create_clean_structure_for_description(tree)
            description_entry['doc_description'] = generate_doc_description(clean_structure, model=model)
    else:
        # No summaries: a single formatting pass, text only on explicit 'yes'.
        tree = format_structure(tree, order=_field_order(if_add_node_text == 'yes'))

    return {
        'doc_name': os.path.splitext(os.path.basename(md_path))[0],
        **description_entry,
        'line_count': line_count,
        'structure': tree,
    }

Callers 3

run_pageindex.pyFile · 0.90

indexMethod · 0.85

page_index_md.pyFile · 0.85

Calls 10

extract_nodes_from_markdownFunction · 0.85

extract_node_text_contentFunction · 0.85

update_node_list_with_text_token_countFunction · 0.85

tree_thinning_for_indexFunction · 0.85

build_tree_from_nodesFunction · 0.85

write_node_idFunction · 0.85

format_structureFunction · 0.85

generate_summaries_for_structure_mdFunction · 0.85

create_clean_structure_for_descriptionFunction · 0.85

generate_doc_descriptionFunction · 0.85

Tested by

no test coverage detected