Takes an article HTML string as input and outputs the full text. The input string is decoded via UnicodeDammit if needed.
(html, language='en')
| 70 | |
| 71 | |
def fulltext(html, language='en'):
    """Return the extracted article text for a raw HTML string.

    A fresh ``Configuration`` is created with its ``language`` set to the
    given value. The HTML is parsed with that configuration's parser,
    cleaned, reduced to the best content node, and passed to the output
    formatter; only the text part of the formatter's result is returned.

    The input string is decoded via UnicodeDammit if needed.
    """
    from .cleaners import DocumentCleaner
    from .configuration import Configuration
    from .extractors import ContentExtractor
    from .outputformatters import OutputFormatter

    cfg = Configuration()
    cfg.language = language

    # Every pipeline stage is built from the same configuration object.
    content_extractor = ContentExtractor(cfg)
    cleaner = DocumentCleaner(cfg)
    formatter = OutputFormatter(cfg)

    parsed_doc = cfg.get_parser().fromstring(html)
    cleaned_doc = cleaner.clean(parsed_doc)

    # Pick the best candidate node, then run the extractor's post-cleanup on it.
    best_node = content_extractor.post_cleanup(
        content_extractor.calculate_best_node(cleaned_doc)
    )

    # get_formatted returns a pair; the second element is not needed here.
    plain_text, _article_html = formatter.get_formatted(best_node)
    return plain_text
searching dependent graphs…