Skip to content

Commit

Permalink
Added processing of unknown elements. Closes #30
Browse files Browse the repository at this point in the history
  • Loading branch information
jemrobinson committed Dec 13, 2018
1 parent a602e8c commit cc477e0
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 87 deletions.
30 changes: 24 additions & 6 deletions plain_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ def block_level_whitelist():
return elements


def known_elements():
    """Return the names of every element this module handles explicitly.

    The result starts with the structural elements (html, head, body, meta)
    and is followed, in order, by the results of elements_to_delete(),
    elements_to_replace_with_contents(), special_elements() and
    block_level_whitelist().
    """
    names = ["html", "head", "body", "meta"]
    # Same left-to-right concatenation as a single chained "+" expression.
    for group in (
        elements_to_delete,
        elements_to_replace_with_contents,
        special_elements,
        block_level_whitelist,
    ):
        names = names + group()
    return names


def remove_metadata(soup):
"""Remove comments and doctype."""
for comment in soup.findAll(string=lambda text:any([isinstance(text, x) for x in [CData, Comment, Doctype]])):
Expand Down Expand Up @@ -170,12 +176,14 @@ def strip_attributes(soup):
for element in soup.find_all():
element.attrs = {}

def remove_empty_elements(soup):
    """Decompose every element of the tree that has no contents.

    An element is removed only when its ``contents`` is empty; an element
    holding any child (including a text node) is left in place.  Each removed
    element is reported on stdout before it is decomposed.
    """
    for node in soup.find_all():
        if node.contents:
            continue
        # Trace output: tag name, stripped markup and the (empty) contents.
        print(" ELEMENT:", node.name, str(node).strip(), node.contents)
        node.decompose()

# def remove_empty_elements(soup):
# """Remove any elements which contain only whitespace."""
# for element in soup.find_all():
# if not element.contents:
# print(" ELEMENT:", element.name, str(element).strip(), element.contents)
# element.decompose()


def recursively_prune(soup):
"""Recursively prune out any elements which have no children."""
Expand All @@ -190,6 +198,13 @@ def single_replace():
pass


def process_unknown_elements(soup):
    """Replace any unknown elements with their contents.

    Every element returned by ``soup.find_all()`` whose name is not listed by
    known_elements() is unwrapped, i.e. the tag itself is dropped while its
    children stay in the tree.  The tree is modified in place and nothing is
    returned.
    """
    # Build the lookup once: known_elements() assembles a fresh list on each
    # call, so calling it per element made this loop needlessly quadratic.
    known = frozenset(known_elements())
    for element in soup.find_all():
        if element.name not in known:
            element.unwrap()


def parse_to_tree(html):
# Convert the HTML into a Soup parse tree
soup = BeautifulSoup(html, "html5lib")
Expand All @@ -206,6 +221,9 @@ def parse_to_tree(html):
# Process elements with special innerText handling
process_special_elements(soup)

# Process unknown elements
process_unknown_elements(soup)

# Remove empty string elements
remove_empty_strings(soup)

Expand Down
23 changes: 12 additions & 11 deletions tests/data/addictinginfo.com-1_full_page.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US" prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#">
<head profile="http://gmpg.org/xfn/11">
Expand Down Expand Up @@ -59,14 +58,14 @@
<link rel='stylesheet' id='media-queries-css' href='http://addictinginfo.com/wp-content/themes/tribuneTwo/media-queries.css?ver=4.9.8' type='text/css' media='all' />
<link rel='stylesheet' id='wzslider-css' href='http://addictinginfo.com/wp-content/themes/tribuneTwo/functions/wpzoom/assets/css/wzslider.css?ver=4.9.8' type='text/css' media='all' />
<link rel='stylesheet' id='wpzoom-custom-css' href='http://addictinginfo.com/wp-content/themes/tribuneTwo/custom.css?ver=4.9.8' type='text/css' media='all' />
<link rel='stylesheet' id='jetpack_css-css' href='http://addictinginfo.com/wp-content/plugins/jetpack/css/jetpack.css?ver=6.6.1' type='text/css' media='all' />
<link rel='stylesheet' id='jetpack_css-css' href='http://addictinginfo.com/wp-content/plugins/jetpack/css/jetpack.css?ver=6.8.1' type='text/css' media='all' />
<script type='text/javascript' src='http://addictinginfo.com/wp-includes/js/jquery/jquery.js?ver=1.12.4'></script>
<script type='text/javascript' src='http://addictinginfo.com/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1'></script>
<script type='text/javascript' src='http://addictinginfo.com/wp-content/themes/tribuneTwo/js/init.js?ver=4.9.8'></script>
<link rel='https://api.w.org/' href='http://addictinginfo.com/wp-json/' />
<link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://addictinginfo.com/xmlrpc.php?rsd" />
<link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://addictinginfo.com/wp-includes/wlwmanifest.xml" />
<link rel='prev' title='Palin F*cks Up Royally With Instagram Meme, Shows How Racist She Is (IMAGES)' href='http://addictinginfo.com/2018/10/14/palin-fcks-up-royally-instagram-meme-shows-how-racist-she-is/' />
<link rel='next' title='In A Rambling Interview, Trump Insists That He&#8217;s Too &#8216;Busy&#8217; To Visit The Troops Overseas' href='http://addictinginfo.com/2018/10/18/in-a-rambling-interview-trump-insists-that-hes-too-busy-to-visit-the-troops-overseas/' />
<meta name="generator" content="WordPress 4.9.8" />
<link rel='shortlink' href='http://addictinginfo.com/?p=342398' />

Expand All @@ -91,6 +90,7 @@
a{color:#cc0000;}
a:hover{color:#000000;}
</style>
<script type="text/javascript">document.write('<style type="text/css">.tabber{display:none;}</style>');</script> <script src="https://cdn.onesignal.com/sdks/OneSignalSDK.js" async></script> <script>

window.OneSignal = window.OneSignal || [];

Expand Down Expand Up @@ -242,7 +242,7 @@
<li id="menu-item-341224" class="menu-item menu-item-type-post_type menu-item-object-page menu-item-341224"><a href="http://addictinginfo.com/terms-of-service/">Terms of Service</a></li>
</ul>
</div>
<div class="clear"></div>
<div class="clear"></div>
</div>
<div id="header-inner">
<div id="head_banner">
Expand Down Expand Up @@ -459,6 +459,8 @@ <h1 class="entry-title">
</div><div id="archives-5" class="widget widget_archive"><h3 class="title">Archives</h3> <label class="screen-reader-text" for="archives-dropdown-5">Archives</label>
<select id="archives-dropdown-5" name="archive-dropdown" onchange='document.location.href=this.options[this.selectedIndex].value;'>
<option value="">Select Month</option>
<option value='http://addictinginfo.com/2018/12/'> December 2018 </option>
<option value='http://addictinginfo.com/2018/11/'> November 2018 </option>
<option value='http://addictinginfo.com/2018/10/'> October 2018 </option>
<option value='http://addictinginfo.com/2018/09/'> September 2018 </option>
<option value='http://addictinginfo.com/2018/08/'> August 2018 </option>
Expand Down Expand Up @@ -570,7 +572,7 @@ <h1 class="entry-title">
<a href="https://www.facebook.com/sharer/sharer.php?u=http://addictinginfo.com/2018/10/15/trump-denies-charitable-donation-he-promised-if-elizabeth-warren-releases-dna-results-and-its-on-video/" target="_blank" class="socicon-facebook fburl socialfooter-facebook">
<div class="socialfooter-facebooktext">
<span></span> Share on Facebook
</div>
</div>
</a>
<a href="#commentssection" class="socialfooter-comments">
<div class="socialfooter-commentstext">
Expand All @@ -580,7 +582,6 @@ <h1 class="entry-title">
</div>

<div class="clear"></div>

</div>
</div>
<div class="clear"></div>
Expand All @@ -600,17 +601,17 @@ <h2>
</div>
</div>
<script async type="text/javascript" src="https://apis.google.com/js/plusone.js"></script>
<script>(function($){$(document).ready(function(){});})(jQuery);</script><script type="text/javascript">/* <![CDATA[ */ jQuery(document).ready( function() { jQuery.post( "http://addictinginfo.com/wp-admin/admin-ajax.php", { action : "entry_views", _ajax_nonce : "fcbecde4ba", post_id : 342398 } ); } ); /* ]]> */</script>
<script type='text/javascript' src='https://s0.wp.com/wp-content/js/devicepx-jetpack.js?ver=201842'></script>
<script>(function($){$(document).ready(function(){});})(jQuery);</script><script type="text/javascript">/* <![CDATA[ */ jQuery(document).ready( function() { jQuery.post( "http://addictinginfo.com/wp-admin/admin-ajax.php", { action : "entry_views", _ajax_nonce : "8660f9b1fc", post_id : 342398 } ); } ); /* ]]> */</script>
<script type='text/javascript' src='https://s0.wp.com/wp-content/js/devicepx-jetpack.js?ver=201850'></script>
<script type='text/javascript' src='http://addictinginfo.com/wp-content/themes/tribuneTwo/js/fredsel.js?ver=4.9.8'></script>
<script type='text/javascript' src='http://addictinginfo.com/wp-content/themes/tribuneTwo/js/tabs.js?ver=4.9.8'></script>
<script type='text/javascript' src='http://addictinginfo.com/wp-content/themes/tribuneTwo/js/dropdown.js?ver=4.9.8'></script>
<script type='text/javascript' src='http://addictinginfo.com/wp-content/themes/tribuneTwo/functions/wpzoom/assets/js/galleria.js'></script>
<script type='text/javascript' src='http://addictinginfo.com/wp-content/themes/tribuneTwo/functions/wpzoom/assets/js/wzslider.js'></script>
<script type='text/javascript' src='https://stats.wp.com/e-201842.js' async='async' defer='defer'></script>
<script type='text/javascript' src='https://stats.wp.com/e-201850.js' async='async' defer='defer'></script>
<script type='text/javascript'>
_stq = window._stq || [];
_stq.push([ 'view', {v:'ext',j:'1:6.6.1',blog:'132801336',post:'342398',tz:'-4',srv:'addictinginfo.com'} ]);
_stq.push([ 'view', {v:'ext',j:'1:6.8.1',blog:'132801336',post:'342398',tz:'-5',srv:'addictinginfo.com'} ]);
_stq.push([ 'clickTrackerInit', '132801336', '342398' ]);
</script>
<script async type="text/javascript">
Expand Down Expand Up @@ -661,4 +662,4 @@ <h2>
</noscript>

</body>
</html>
</html>
125 changes: 55 additions & 70 deletions tests/test_html_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,61 +91,51 @@ def test_html_whitelist_caption():
<th>H</th>
<th>T</th>
</tr>
<tr>
<th>H</th>
<td>HH</td>
<td>TH</td>
</tr>
<tr>
<th>T</th>
<td>HT</td>
<td>TT</td>
</tr>
</tbody>
</table>
""")


def test_html_whitelist_colgroup():
"""The colgroup element groups col elements inside its parent table."""
# First argument: input HTML with span/style attributes on the <col> tags.
# Second argument: the fragment looked for, a <colgroup> with bare <col/> tags.
# NOTE(review): check_html_output_contains_text is defined elsewhere;
# presumably it asserts the processed output contains the fragment - confirm.
check_html_output_contains_text("""
<table>
<colgroup>
<col span="2" style="background-color:red"/>
<col style="background-color:yellow"/>
</colgroup>
<tr>
<th>ISBN</th>
<th>Title</th>
<th>Price</th>
</tr>
</table>
""", """
<colgroup>
<col/>
<col/>
</colgroup>
""")


def test_html_whitelist_col():
"""The col element describes one or more columns in a table."""
# Same input table as the colgroup test; here only the two bare <col/> tags
# (without the surrounding <colgroup>) are passed as the second argument.
# NOTE(review): check_html_output_contains_text is defined elsewhere;
# presumably it asserts the processed output contains the fragment - confirm.
check_html_output_contains_text("""
<table>
<colgroup>
<col span="2" style="background-color:red"/>
<col style="background-color:yellow"/>
</colgroup>
<tr>
<th>ISBN</th>
<th>Title</th>
<th>Price</th>
</tr>
</table>
""", """
<col/>
<col/>
""")
""", "<caption>Table 1. This shows the possible results of flipping two coins.</caption>")


# def test_html_whitelist_colgroup():
# """The colgroup element groups col elements inside its parent table."""
# check_html_output_contains_text("""
# <table>
# <colgroup>
# <col span="2" style="background-color:red"/>
# <col style="background-color:yellow"/>
# </colgroup>
# <tr>
# <th>ISBN</th>
# <th>Title</th>
# <th>Price</th>
# </tr>
# </table>
# """, """
# <colgroup>
# <col/>
# <col/>
# </colgroup>
# """)


# def test_html_whitelist_col():
# """The col element describes one or more columns in a table."""
# check_html_output_contains_text("""
# <table>
# <colgroup>
# <col span="2" style="background-color:red"/>
# <col style="background-color:yellow"/>
# </colgroup>
# <tr>
# <th>ISBN</th>
# <th>Title</th>
# <th>Price</th>
# </tr>
# </table>
# """, """
# <col/>
# <col/>
# """)


def test_html_whitelist_div():
Expand Down Expand Up @@ -367,22 +357,26 @@ def test_html_whitelist_table():
"""The table element represents data with more than one dimension."""
check_html_output_contains_text("""
<table>
<tr>
<td>Table contents</td>
</tr>
<tbody>
<tr>
<td>Table contents</td>
</tr>
</tbody>
</table>
""", "<table><tr><td>Table contents</td></tr></table>")
""", "<table><tbody><tr><td>Table contents</td></tr></tbody></table>")


def test_html_whitelist_tbody():
"""The tbody element represents a block of rows inside its parent table."""
check_html_output_contains_text("""
<table>
<tbody>
<td>Table body content</td>
<tr>
<td>Table body content</td>
</tr>
</tbody>
</table>
""", "<tbody><td>Table body content</td></tbody>")
""", "<tbody><tr><td>Table body content</td></tr></tbody>")


def test_html_whitelist_thead():
Expand Down Expand Up @@ -911,15 +905,6 @@ def test_html_blacklist_link():
""", "link")


def test_html_blacklist_time():
"""The time element has a time and a machine-readable datetime."""
# Passes a paragraph containing a <time datetime=...> element together with
# the tag name "time".
# NOTE(review): check_html_output_does_not_contain_tag is defined elsewhere;
# presumably it asserts that no tag of that name survives processing - confirm.
check_html_output_does_not_contain_tag("""
<p>
We open at <time datetime="2018-11-21 10:00">10:00 tomorrow</time>.
</p>
""", "time")


def test_html_blacklist_style():
"""The style element embeds style information in the document."""
check_html_output_does_not_contain_tag("""
Expand Down Expand Up @@ -992,8 +977,8 @@ def test_html_special_sup():
@mark.parametrize("element", ["a", "abbr", "address", "b", "bdi", "bdo",
"cite", "code", "del", "dfn", "em", "i", "ins",
"kbs", "mark", "rb", "ruby", "rp", "rt", "rtc",
"s", "samp", "small", "span", "strong", "u",
"var", "wbr"])
"s", "samp", "small", "span", "strong", "time",
"u", "var", "wbr"])
def test_html_remaining_element(element):
"""Simple standalone elements which can contain text.
Check that the inner text is kept and the tag is discarded."""
Expand Down

0 comments on commit cc477e0

Please sign in to comment.