Returns a string with all HTML tags removed. Content inside HTML comments, the <style> tag and the <script> tags is removed. - keep : a list of tags to keep. Element attributes are stripped. To preserve attributes a dict of (tag name, [attribute])-items can be given.
(html, keep=[], replace=blocks, linebreaks=2, indentation=False)
| 878 | return string |
| 879 | |
def plaintext(html, keep=[], replace=blocks, linebreaks=2, indentation=False):
    """ Returns a string with all HTML tags removed.
        Content inside HTML comments, the <style> tag and the <script> tags is removed,
        and the string is also passed through strip_forms(),
        unless the corresponding key is listed in keep
        ("comment" or "!--", "style", "script", "form").
        - keep        : a list of tags to keep. Element attributes are stripped.
                        To preserve attributes a dict of (tag name, [attribute])-items can be given.
        - replace     : a dictionary of (tag name, (replace_before, replace_after))-items.
                        By default, block-level elements are followed by linebreaks.
        - linebreaks  : the maximum amount of consecutive linebreaks,
        - indentation : keep left line indentation (tabs and spaces)?
        The result has leading and trailing whitespace stripped.
    """
    # NOTE(review): keep=[] is a mutable default. This function only reads it and
    # forwards it to strip_tags(); confirm strip_tags() does not mutate it either.
    # Membership is tested with the "in" operator (not keep.__contains__) so that
    # any container or iterable is accepted; lists and dicts behave as before.
    if "script" not in keep:
        html = strip_javascript(html)
    if "style" not in keep:
        html = strip_inline_css(html)
    if "form" not in keep:
        html = strip_forms(html)
    # Comments are preserved when either spelling ("comment" or "!--") is in keep.
    if "comment" not in keep and "!--" not in keep:
        html = strip_comments(html)
    # Turn carriage returns into newlines before tags are stripped and
    # whitespace is collapsed.
    html = html.replace("\r", "\n")
    html = strip_tags(html, exclude=keep, replace=replace)
    html = decode_entities(html)
    html = collapse_spaces(html, indentation)
    html = collapse_tabs(html, indentation)
    html = collapse_linebreaks(html, linebreaks)
    return html.strip()
| 907 | |
| 908 | #### SEARCH ENGINE ################################################################################# |
| 909 |
no test coverage detected
searching dependent graphs…