hub / github.com/karpathy/reader3 / process_epub

Function process_epub

reader3.py:175–283 · view source on GitHub ↗

(epub_path: str, output_dir: str)

Source from the content-addressed store, hash-verified

173	# --- Main Conversion Logic ---
174
175	def process_epub(epub_path: str, output_dir: str) -> Book:
176
177	# 1. Load Book
178	print(f"Loading {epub_path}...")
179	book = epub.read_epub(epub_path)
180
181	# 2. Extract Metadata
182	metadata = extract_metadata_robust(book)
183
184	# 3. Prepare Output Directories
185	if os.path.exists(output_dir):
186	shutil.rmtree(output_dir)
187	images_dir = os.path.join(output_dir, 'images')
188	os.makedirs(images_dir, exist_ok=True)
189
190	# 4. Extract Images & Build Map
191	print("Extracting images...")
192	image_map = {} # Key: internal_path, Value: local_relative_path
193
194	for item in book.get_items():
195	if item.get_type() == ebooklib.ITEM_IMAGE:
196	# Normalize filename
197	original_fname = os.path.basename(item.get_name())
198	# Sanitize filename for OS
199	safe_fname = "".join([c for c in original_fname if c.isalpha() or c.isdigit() or c in '._-']).strip()
200
201	# Save to disk
202	local_path = os.path.join(images_dir, safe_fname)
203	with open(local_path, 'wb') as f:
204	f.write(item.get_content())
205
206	# Map keys: We try both the full internal path and just the basename
207	# to be robust against messy HTML src attributes
208	rel_path = f"images/{safe_fname}"
209	image_map[item.get_name()] = rel_path
210	image_map[original_fname] = rel_path
211
212	# 5. Process TOC
213	print("Parsing Table of Contents...")
214	toc_structure = parse_toc_recursive(book.toc)
215	if not toc_structure:
216	print("Warning: Empty TOC, building fallback from Spine...")
217	toc_structure = get_fallback_toc(book)
218
219	# 6. Process Content (Spine-based to preserve HTML validity)
220	print("Processing chapters...")
221	spine_chapters = []
222
223	# We iterate over the spine (linear reading order)
224	for i, spine_item in enumerate(book.spine):
225	item_id, linear = spine_item
226	item = book.get_item_with_id(item_id)
227
228	if not item:
229	continue
230
231	if item.get_type() == ebooklib.ITEM_DOCUMENT:
232	# Raw content

Callers 1

reader3.py — File · 0.85

Calls 7

extract_metadata_robust — Function · 0.85

parse_toc_recursive — Function · 0.85

get_fallback_toc — Function · 0.85

clean_html_content — Function · 0.85

ChapterContent — Class · 0.85

extract_plain_text — Function · 0.85

Book — Class · 0.85

Tested by

no test coverage detected