From cc477e0d53073031dce3511318ebfb6c87f43ac9 Mon Sep 17 00:00:00 2001 From: James Robinson Date: Thu, 13 Dec 2018 12:21:21 +0000 Subject: [PATCH] Added processing of unknown elements. Closes #30 --- plain_html.py | 30 ++++- tests/data/addictinginfo.com-1_full_page.html | 23 ++-- tests/test_html_elements.py | 125 ++++++++---------- 3 files changed, 91 insertions(+), 87 deletions(-) diff --git a/plain_html.py b/plain_html.py index 071dbb6..d6d9a98 100644 --- a/plain_html.py +++ b/plain_html.py @@ -45,6 +45,12 @@ def block_level_whitelist(): return elements +def known_elements(): + """All elements that we know by name.""" + structural_elements = ["html", "head", "body", "meta"] + return structural_elements + elements_to_delete() + elements_to_replace_with_contents() + special_elements() + block_level_whitelist() + + def remove_metadata(soup): """Remove comments and doctype.""" for comment in soup.findAll(string=lambda text:any([isinstance(text, x) for x in [CData, Comment, Doctype]])): @@ -170,12 +176,14 @@ def strip_attributes(soup): for element in soup.find_all(): element.attrs = {} -def remove_empty_elements(soup): - """Remove any elements which contain only whitespace.""" - for element in soup.find_all(): - if not element.contents: - print(" ELEMENT:", element.name, str(element).strip(), element.contents) - element.decompose() + +# def remove_empty_elements(soup): +# """Remove any elements which contain only whitespace.""" +# for element in soup.find_all(): +# if not element.contents: +# print(" ELEMENT:", element.name, str(element).strip(), element.contents) +# element.decompose() + def recursively_prune(soup): """Recursively prune out any elements which have no children.""" @@ -190,6 +198,13 @@ def single_replace(): pass +def process_unknown_elements(soup): + """Replace any unknown elements with their contents.""" + for element in soup.find_all(): + if element.name not in known_elements(): + element.unwrap() + + def parse_to_tree(html): # Convert the HTML into a Soup parse tree soup = BeautifulSoup(html, "html5lib") @@ -206,6 +221,9 @@ def parse_to_tree(html): # Process elements with special innerText handling process_special_elements(soup) + # Process unknown elements + process_unknown_elements(soup) + # Remove empty string elements remove_empty_strings(soup) diff --git a/tests/data/addictinginfo.com-1_full_page.html b/tests/data/addictinginfo.com-1_full_page.html index 03fae58..74f2442 100644 --- a/tests/data/addictinginfo.com-1_full_page.html +++ b/tests/data/addictinginfo.com-1_full_page.html @@ -1,4 +1,3 @@ - @@ -59,14 +58,14 @@ - + - + @@ -91,6 +90,7 @@ a{color:#cc0000;} a:hover{color:#000000;} + - - + + - +