Function page_index_builder

pageindex/page_index.py:1082–1108 · view source on GitHub ↗

()

Source from the content-addressed store, hash-verified

1080	logger.info({'total_token': sum([page[1] for page in page_list])})
1081
async def page_index_builder():
    """Build the page-index result for the document of the enclosing call.

    Takes no arguments: ``page_list``, ``opt``, ``doc`` and ``logger`` are
    read from the enclosing scope.

    Steps, each gated by a ``'yes'``/``'no'`` string flag on ``opt``:

    1. Parse the tree with ``tree_parser`` (always).
    2. ``if_add_node_id``: call ``write_node_id`` on the tree.
    3. ``if_add_node_text``: call ``add_node_text`` on the tree.
    4. ``if_add_node_summary``: call ``generate_summaries_for_structure``.
       When ``if_add_node_text`` is ``'no'``, node text is added before that
       call and removed again right after it.
    5. ``if_add_doc_description`` (only evaluated inside step 4): build a
       cleaned copy of the tree and pass it to ``generate_doc_description``.

    Returns:
        A dict with ``'doc_name'`` and ``'structure'`` (the tree passed
        through ``format_structure``), plus ``'doc_description'`` between
        them when step 5 ran.

    NOTE(review): the nesting of steps 4-5 was reconstructed from a listing
    whose indentation was flattened - confirm against the repository that the
    doc-description step sits inside the summary branch.
    NOTE(review): the temporary-text path triggers only for the exact value
    ``'no'``; any other non-``'yes'`` value skips it - confirm that flag
    values are validated upstream.
    """
    tree = await tree_parser(page_list, opt, doc=doc, logger=logger)

    if opt.if_add_node_id == 'yes':
        write_node_id(tree)
    if opt.if_add_node_text == 'yes':
        add_node_text(tree, page_list)

    # Stays empty unless a document description is actually generated, so the
    # 'doc_description' key is absent (not None) in the plain result.
    description_entry = {}

    if opt.if_add_node_summary == 'yes':
        # Text was not requested in the output, so it is attached only for
        # the duration of the summary call.
        text_is_temporary = opt.if_add_node_text == 'no'

        if text_is_temporary:
            add_node_text(tree, page_list)
        await generate_summaries_for_structure(tree, model=opt.model)
        if text_is_temporary:
            remove_structure_text(tree)

        if opt.if_add_doc_description == 'yes':
            # The description is generated from a cleaned copy of the tree
            # rather than from the full structure.
            outline = create_clean_structure_for_description(tree)
            description_entry = {
                'doc_description': generate_doc_description(outline, model=opt.model),
            }

    ordered_tree = format_structure(
        tree,
        order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'],
    )
    return {
        'doc_name': get_pdf_name(doc),
        **description_entry,
        'structure': ordered_tree,
    }
1109
1110	return asyncio.run(page_index_builder())
1111

page_index_mainFunction · 0.85

tree_parserFunction · 0.85

write_node_idFunction · 0.85

add_node_textFunction · 0.85

generate_summaries_for_structureFunction · 0.85

remove_structure_textFunction · 0.85

create_clean_structure_for_descriptionFunction · 0.85

generate_doc_descriptionFunction · 0.85

format_structureFunction · 0.85

get_pdf_nameFunction · 0.85

no test coverage detected