(epub_path: str, output_dir: str)
| 173 | # --- Main Conversion Logic --- |
| 174 | |
| 175 | def process_epub(epub_path: str, output_dir: str) -> Book: |
| 176 | |
| 177 | # 1. Load Book |
| 178 | print(f"Loading {epub_path}...") |
| 179 | book = epub.read_epub(epub_path) |
| 180 | |
| 181 | # 2. Extract Metadata |
| 182 | metadata = extract_metadata_robust(book) |
| 183 | |
| 184 | # 3. Prepare Output Directories |
| 185 | if os.path.exists(output_dir): |
| 186 | shutil.rmtree(output_dir) |
| 187 | images_dir = os.path.join(output_dir, 'images') |
| 188 | os.makedirs(images_dir, exist_ok=True) |
| 189 | |
| 190 | # 4. Extract Images & Build Map |
| 191 | print("Extracting images...") |
| 192 | image_map = {} # Key: internal_path, Value: local_relative_path |
| 193 | |
| 194 | for item in book.get_items(): |
| 195 | if item.get_type() == ebooklib.ITEM_IMAGE: |
| 196 | # Use only the basename; directory components of the internal path are dropped |
| 197 | original_fname = os.path.basename(item.get_name()) |
| 198 | # Sanitize filename for OS: keep only letters, digits, and the characters '._-' |
| 199 | safe_fname = "".join([c for c in original_fname if c.isalpha() or c.isdigit() or c in '._-']).strip() |
| 200 | |
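| 201 | # Save to disk. NOTE(review): the path is built from the sanitized basename only, so images sharing a basename in different directories overwrite each other |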
| 202 | local_path = os.path.join(images_dir, safe_fname) |
| 203 | with open(local_path, 'wb') as f: |
| 204 | f.write(item.get_content()) |
| 205 | |
| 206 | # Map keys: register both the full internal path and the bare basename, |
| 207 | # so lookups still succeed when HTML src attributes are messy |
| 208 | rel_path = f"images/{safe_fname}" |
| 209 | image_map[item.get_name()] = rel_path |
| 210 | image_map[original_fname] = rel_path |
| 211 | |
| 212 | # 5. Process TOC |
| 213 | print("Parsing Table of Contents...") |
| 214 | toc_structure = parse_toc_recursive(book.toc) |
| 215 | if not toc_structure: |
| 216 | print("Warning: Empty TOC, building fallback from Spine...") |
| 217 | toc_structure = get_fallback_toc(book) |
| 218 | |
| 219 | # 6. Process Content (Spine-based to preserve HTML validity) |
| 220 | print("Processing chapters...") |
| 221 | spine_chapters = [] |
| 222 | |
| 223 | # We iterate over the spine (linear reading order) |
| 224 | for i, spine_item in enumerate(book.spine): |
| 225 | item_id, linear = spine_item |
| 226 | item = book.get_item_with_id(item_id) |
| 227 | |
| 228 | if not item: |
| 229 | continue |
| 230 | |
| 231 | if item.get_type() == ebooklib.ITEM_DOCUMENT: |
| 232 | # Raw content |
no test coverage detected