(text: str)
| 2 | |
| 3 | |
def extract_html_content(text: str) -> str:
    """Extract an HTML document from generated text.

    The extraction proceeds in this order:

    1. If the text contains a ``<file path="...">...</file>`` wrapper, the
       wrapper's (stripped) content is processed recursively and returned.
    2. Markdown code fences (an opening ```` ```html ```` / ```` ```htm ````
       line prefix and closing ```` ``` ```` line suffix) are removed.
    3. If a ``<!DOCTYPE html ...>`` followed by ``<html ...>...</html>`` is
       found, that span (DOCTYPE included) is returned.
    4. Otherwise the first ``<html ...>...</html>`` span is returned.
    5. If no ``<html>`` element is found, a message is printed and the
       fence-stripped text is returned as-is.

    Tag matching is case-insensitive in every step.

    Args:
        text: Raw text that may contain an HTML document.

    Returns:
        The extracted HTML span, or the fence-stripped input when no
        ``<html>`` element could be located.
    """
    file_match = re.search(
        r"<file\s+path=\"[^\"]+\">\s*(.*?)\s*</file>",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if file_match:
        # Unwrap the <file> container and run the remaining steps on its body.
        return extract_html_content(file_match.group(1).strip())

    # Strip markdown code fences if present. MULTILINE lets the anchors match
    # at any line boundary, not only at the very start/end of the text.
    text = re.sub(r'^```html?\s*\n?', '', text, flags=re.MULTILINE)
    text = re.sub(r'\n?```\s*$', '', text, flags=re.MULTILINE)

    # Prefer a match that keeps the DOCTYPE together with the <html> element.
    match_with_doctype = re.search(
        r"(<!DOCTYPE\s+html[^>]*>.*?<html.*?>.*?</html>)",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if match_with_doctype:
        return match_with_doctype.group(1)

    # Fall back to just the <html> element. IGNORECASE keeps this consistent
    # with the DOCTYPE search above so <HTML>...</HTML> is also recognised.
    match = re.search(
        r"(<html.*?>.*?</html>)", text, re.DOTALL | re.IGNORECASE
    )
    if match:
        return match.group(1)

    # No <html> element found: report it and return the fence-stripped text.
    print(
        "[HTML Extraction] No <html> tags found in the generated content"
    )
    return text
no outgoing calls