MCPcopy Index your code
hub / github.com/karpathy/reader3 / process_epub

Function process_epub

reader3.py:175–283  ·  view source on GitHub ↗
(epub_path: str, output_dir: str)

Source from the content-addressed store, hash-verified

173# --- Main Conversion Logic ---
174
175def process_epub(epub_path: str, output_dir: str) -> Book:
176
177 # 1. Load Book
178 print(f"Loading {epub_path}...")
179 book = epub.read_epub(epub_path)
180
181 # 2. Extract Metadata
182 metadata = extract_metadata_robust(book)
183
184 # 3. Prepare Output Directories
185 if os.path.exists(output_dir):
186 shutil.rmtree(output_dir)
187 images_dir = os.path.join(output_dir, 'images')
188 os.makedirs(images_dir, exist_ok=True)
189
190 # 4. Extract Images & Build Map
191 print("Extracting images...")
192 image_map = {} # Key: internal_path, Value: local_relative_path
193
194 for item in book.get_items():
195 if item.get_type() == ebooklib.ITEM_IMAGE:
196 # Normalize filename
197 original_fname = os.path.basename(item.get_name())
198 # Sanitize filename for OS
199 safe_fname = "".join([c for c in original_fname if c.isalpha() or c.isdigit() or c in '._-']).strip()
200
201 # Save to disk
202 local_path = os.path.join(images_dir, safe_fname)
203 with open(local_path, 'wb') as f:
204 f.write(item.get_content())
205
206 # Map keys: We try both the full internal path and just the basename
207 # to be robust against messy HTML src attributes
208 rel_path = f"images/{safe_fname}"
209 image_map[item.get_name()] = rel_path
210 image_map[original_fname] = rel_path
211
212 # 5. Process TOC
213 print("Parsing Table of Contents...")
214 toc_structure = parse_toc_recursive(book.toc)
215 if not toc_structure:
216 print("Warning: Empty TOC, building fallback from Spine...")
217 toc_structure = get_fallback_toc(book)
218
219 # 6. Process Content (Spine-based to preserve HTML validity)
220 print("Processing chapters...")
221 spine_chapters = []
222
223 # We iterate over the spine (linear reading order)
224 for i, spine_item in enumerate(book.spine):
225 item_id, linear = spine_item
226 item = book.get_item_with_id(item_id)
227
228 if not item:
229 continue
230
231 if item.get_type() == ebooklib.ITEM_DOCUMENT:
232 # Raw content

Callers 1

reader3.py (File) · 0.85

Calls 7

extract_metadata_robust (Function) · 0.85
parse_toc_recursive (Function) · 0.85
get_fallback_toc (Function) · 0.85
clean_html_content (Function) · 0.85
ChapterContent (Class) · 0.85
extract_plain_text (Function) · 0.85
Book (Class) · 0.85

Tested by

no test coverage detected