Function extract_json

pageindex/utils.py:99–130 · view source on GitHub ↗

(content)

Source from the content-addressed store, hash-verified

97
98
def extract_json(content):
    """Extract and parse a JSON value embedded in a text response.

    The text is searched for a ```json fenced block; when one is found, only
    the text between the opening fence and the last closing ``` is parsed.
    If the closing fence is missing (e.g. truncated output), everything after
    the opening fence is used.  Without a fence the whole text is treated as
    JSON.

    Parsing is attempted in three increasingly lenient passes:

    1. the extracted text exactly as written, so valid JSON is never altered;
    2. after replacing Python ``None`` with ``null`` and collapsing all
       whitespace runs (including newlines) to single spaces;
    3. after additionally removing trailing commas before ``]`` or ``}``.

    Args:
        content: Text expected to contain JSON, optionally inside a
            ```json fenced block.

    Returns:
        The parsed JSON value, or an empty dict if every pass fails or the
        input cannot be processed (for example, it is not a string).
    """
    # Function-scope import so this block does not depend on the module
    # importing ``re`` at the top of the file.
    import re

    try:
        fence = "```json"
        start_idx = content.find(fence)
        if start_idx != -1:
            start_idx += len(fence)  # Start right after the opening fence
            # Only look for the closing fence *after* the opening one;
            # otherwise rfind would match the opening fence itself and the
            # slice would be empty.
            end_idx = content.rfind("```", start_idx)
            if end_idx == -1:
                end_idx = len(content)
            json_content = content[start_idx:end_idx].strip()
        else:
            # No delimiters: assume the entire content could be JSON
            json_content = content.strip()

        # Pass 1: parse untouched, so string values containing "None" or
        # repeated whitespace are not rewritten by the cleanup below.
        try:
            return json.loads(json_content)
        except json.JSONDecodeError:
            pass  # Fall through to the lenient cleanup passes

        # Pass 2: clean up common issues that cause parsing errors
        json_content = json_content.replace('None', 'null')  # Python None -> JSON null
        json_content = ' '.join(json_content.split())  # Collapse newlines/whitespace

        return json.loads(json_content)
    except json.JSONDecodeError as e:
        logging.error(f"Failed to extract JSON: {e}")
        # Pass 3: drop trailing commas before closing brackets/braces,
        # tolerating whitespace between the comma and the bracket.
        try:
            json_content = re.sub(r',\s*([\]}])', r'\1', json_content)
            return json.loads(json_content)
        except json.JSONDecodeError:
            logging.error("Failed to parse JSON even after cleanup")
            return {}
    except Exception as e:
        logging.error(f"Unexpected error while extracting JSON: {e}")
        return {}
131
132	def write_node_id(data, node_id=0):
133	if isinstance(data, dict):

check_title_appearanceFunction · 0.85

check_title_appearance_in_startFunction · 0.85

toc_detector_single_pageFunction · 0.85

check_if_toc_extraction_is_completeFunction · 0.85

check_if_toc_transformation_is_completeFunction · 0.85

detect_page_indexFunction · 0.85

toc_index_extractorFunction · 0.85

toc_transformerFunction · 0.85

add_page_number_to_tocFunction · 0.85

generate_toc_continueFunction · 0.85

generate_toc_initFunction · 0.85

single_toc_item_index_fixerFunction · 0.85

errorMethod · 0.80

no test coverage detected