From cc477e0d53073031dce3511318ebfb6c87f43ac9 Mon Sep 17 00:00:00 2001
From: James Robinson <james.em.robinson@gmail.com>
Date: Thu, 13 Dec 2018 12:21:21 +0000
Subject: [PATCH] Added processing of unknown elements. Closes #30

---
 plain_html.py                                 |  30 ++++-
 tests/data/addictinginfo.com-1_full_page.html |  23 ++--
 tests/test_html_elements.py                   | 125 ++++++++----------
 3 files changed, 91 insertions(+), 87 deletions(-)
diff --git a/plain_html.py b/plain_html.py
index 071dbb6..d6d9a98 100644
--- a/plain_html.py
+++ b/plain_html.py
@@ -45,6 +45,12 @@ def block_level_whitelist():
     return elements
 
 
+def known_elements():
+    """List every tag name we know: structural tags plus the delete, replace-with-contents, special and block-level lists."""
+    categorised = elements_to_delete() + elements_to_replace_with_contents() + special_elements() + block_level_whitelist()
+    return ["html", "head", "body", "meta"] + categorised
+
+
 def remove_metadata(soup):
     """Remove comments and doctype."""
     for comment in soup.findAll(string=lambda text:any([isinstance(text, x) for x in [CData, Comment, Doctype]])):
@@ -170,12 +176,14 @@ def strip_attributes(soup):
     for element in soup.find_all():
         element.attrs = {}
 
-def remove_empty_elements(soup):
-    """Remove any elements which contain only whitespace."""
-    for element in soup.find_all():
-        if not element.contents:
-            print("  ELEMENT:", element.name, str(element).strip(), element.contents)
-            element.decompose()
+
+# def remove_empty_elements(soup):
+#     """Remove any elements which contain only whitespace."""
+#     for element in soup.find_all():
+#         if not element.contents:
+#             print("  ELEMENT:", element.name, str(element).strip(), element.contents)
+#             element.decompose()
+
 
 def recursively_prune(soup):
     """Recursively prune out any elements which have no children."""
@@ -190,6 +198,13 @@ def single_replace():
         pass
 
 
+def process_unknown_elements(soup):
+    """Unwrap (replace with its contents) every element in soup whose tag name is not in known_elements()."""
+    known = frozenset(known_elements())  # build the lookup once instead of rebuilding the list per element
+    for element in [tag for tag in soup.find_all() if tag.name not in known]:
+        element.unwrap()
+
+
 def parse_to_tree(html):
     # Convert the HTML into a Soup parse tree
     soup = BeautifulSoup(html, "html5lib")
@@ -206,6 +221,9 @@ def parse_to_tree(html):
     # Process elements with special innerText handling
     process_special_elements(soup)
 
+    # Process unknown elements
+    process_unknown_elements(soup)
+
     # Remove empty string elements
     remove_empty_strings(soup)
 
diff --git a/tests/data/addictinginfo.com-1_full_page.html b/tests/data/addictinginfo.com-1_full_page.html
index 03fae58..74f2442 100644
--- a/tests/data/addictinginfo.com-1_full_page.html
+++ b/tests/data/addictinginfo.com-1_full_page.html
@@ -1,4 +1,3 @@
-
 <!DOCTYPE html>
 <html xmlns="http://www.w3.org/1999/xhtml" lang="en-US" prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#">
 <head profile="http://gmpg.org/xfn/11">
@@ -59,14 +58,14 @@
 <link rel='stylesheet' id='media-queries-css' href='http://addictinginfo.com/wp-content/themes/tribuneTwo/media-queries.css?ver=4.9.8' type='text/css' media='all' />
 <link rel='stylesheet' id='wzslider-css' href='http://addictinginfo.com/wp-content/themes/tribuneTwo/functions/wpzoom/assets/css/wzslider.css?ver=4.9.8' type='text/css' media='all' />
 <link rel='stylesheet' id='wpzoom-custom-css' href='http://addictinginfo.com/wp-content/themes/tribuneTwo/custom.css?ver=4.9.8' type='text/css' media='all' />
-<link rel='stylesheet' id='jetpack_css-css' href='http://addictinginfo.com/wp-content/plugins/jetpack/css/jetpack.css?ver=6.6.1' type='text/css' media='all' />
+<link rel='stylesheet' id='jetpack_css-css' href='http://addictinginfo.com/wp-content/plugins/jetpack/css/jetpack.css?ver=6.8.1' type='text/css' media='all' />
 <script type='text/javascript' src='http://addictinginfo.com/wp-includes/js/jquery/jquery.js?ver=1.12.4'></script>
 <script type='text/javascript' src='http://addictinginfo.com/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1'></script>
 <script type='text/javascript' src='http://addictinginfo.com/wp-content/themes/tribuneTwo/js/init.js?ver=4.9.8'></script>
 <link rel='https://api.w.org/' href='http://addictinginfo.com/wp-json/' />
-<link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://addictinginfo.com/xmlrpc.php?rsd" />
 <link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://addictinginfo.com/wp-includes/wlwmanifest.xml" />
 <link rel='prev' title='Palin F*cks Up Royally With Instagram Meme, Shows How Racist She Is (IMAGES)' href='http://addictinginfo.com/2018/10/14/palin-fcks-up-royally-instagram-meme-shows-how-racist-she-is/' />
+<link rel='next' title='In A Rambling Interview, Trump Insists That He&#8217;s Too &#8216;Busy&#8217; To Visit The Troops Overseas' href='http://addictinginfo.com/2018/10/18/in-a-rambling-interview-trump-insists-that-hes-too-busy-to-visit-the-troops-overseas/' />
 <meta name="generator" content="WordPress 4.9.8" />
 <link rel='shortlink' href='http://addictinginfo.com/?p=342398' />
 
@@ -91,6 +90,7 @@
 a{color:#cc0000;}
 a:hover{color:#000000;}
 </style>
+<script type="text/javascript">document.write('<style type="text/css">.tabber{display:none;}</style>');</script> <script src="https://cdn.onesignal.com/sdks/OneSignalSDK.js" async></script> <script>
 
       window.OneSignal = window.OneSignal || [];
 
@@ -242,7 +242,7 @@
 <li id="menu-item-341224" class="menu-item menu-item-type-post_type menu-item-object-page menu-item-341224"><a href="http://addictinginfo.com/terms-of-service/">Terms of Service</a></li>
 </ul>
 </div>
- <div class="clear"></div>
+<div class="clear"></div>
 </div>
 <div id="header-inner">
 <div id="head_banner">
@@ -459,6 +459,8 @@ <h1 class="entry-title">
 </div><div id="archives-5" class="widget widget_archive"><h3 class="title">Archives</h3> <label class="screen-reader-text" for="archives-dropdown-5">Archives</label>
 <select id="archives-dropdown-5" name="archive-dropdown" onchange='document.location.href=this.options[this.selectedIndex].value;'>
 <option value="">Select Month</option>
+<option value='http://addictinginfo.com/2018/12/'> December 2018 </option>
+<option value='http://addictinginfo.com/2018/11/'> November 2018 </option>
 <option value='http://addictinginfo.com/2018/10/'> October 2018 </option>
 <option value='http://addictinginfo.com/2018/09/'> September 2018 </option>
 <option value='http://addictinginfo.com/2018/08/'> August 2018 </option>
@@ -570,7 +572,7 @@ <h1 class="entry-title">
 <a href="https://www.facebook.com/sharer/sharer.php?u=http://addictinginfo.com/2018/10/15/trump-denies-charitable-donation-he-promised-if-elizabeth-warren-releases-dna-results-and-its-on-video/" target="_blank" class="socicon-facebook fburl socialfooter-facebook">
 <div class="socialfooter-facebooktext">
 <span></span> Share on Facebook
-</div>
+ </div>
 </a>
 <a href="#commentssection" class="socialfooter-comments">
 <div class="socialfooter-commentstext">
@@ -580,7 +582,6 @@ <h1 class="entry-title">
 </div>
 
 <div class="clear"></div>
-
 </div>
 </div>
 <div class="clear"></div>
@@ -600,17 +601,17 @@ <h2>
 </div>
 </div>
 <script async type="text/javascript" src="https://apis.google.com/js/plusone.js"></script>
-<script>(function($){$(document).ready(function(){});})(jQuery);</script><script type="text/javascript">/* <![CDATA[ */ jQuery(document).ready( function() { jQuery.post( "http://addictinginfo.com/wp-admin/admin-ajax.php", { action : "entry_views", _ajax_nonce : "fcbecde4ba", post_id : 342398 } ); } ); /* ]]> */</script>
-<script type='text/javascript' src='https://s0.wp.com/wp-content/js/devicepx-jetpack.js?ver=201842'></script>
+<script>(function($){$(document).ready(function(){});})(jQuery);</script><script type="text/javascript">/* <![CDATA[ */ jQuery(document).ready( function() { jQuery.post( "http://addictinginfo.com/wp-admin/admin-ajax.php", { action : "entry_views", _ajax_nonce : "8660f9b1fc", post_id : 342398 } ); } ); /* ]]> */</script>
+<script type='text/javascript' src='https://s0.wp.com/wp-content/js/devicepx-jetpack.js?ver=201850'></script>
 <script type='text/javascript' src='http://addictinginfo.com/wp-content/themes/tribuneTwo/js/fredsel.js?ver=4.9.8'></script>
 <script type='text/javascript' src='http://addictinginfo.com/wp-content/themes/tribuneTwo/js/tabs.js?ver=4.9.8'></script>
 <script type='text/javascript' src='http://addictinginfo.com/wp-content/themes/tribuneTwo/js/dropdown.js?ver=4.9.8'></script>
 <script type='text/javascript' src='http://addictinginfo.com/wp-content/themes/tribuneTwo/functions/wpzoom/assets/js/galleria.js'></script>
 <script type='text/javascript' src='http://addictinginfo.com/wp-content/themes/tribuneTwo/functions/wpzoom/assets/js/wzslider.js'></script>
-<script type='text/javascript' src='https://stats.wp.com/e-201842.js' async='async' defer='defer'></script>
+<script type='text/javascript' src='https://stats.wp.com/e-201850.js' async='async' defer='defer'></script>
 <script type='text/javascript'>
 	_stq = window._stq || [];
-	_stq.push([ 'view', {v:'ext',j:'1:6.6.1',blog:'132801336',post:'342398',tz:'-4',srv:'addictinginfo.com'} ]);
+	_stq.push([ 'view', {v:'ext',j:'1:6.8.1',blog:'132801336',post:'342398',tz:'-5',srv:'addictinginfo.com'} ]);
 	_stq.push([ 'clickTrackerInit', '132801336', '342398' ]);
 </script>
 <script async type="text/javascript">
@@ -661,4 +662,4 @@ <h2>
 </noscript>
 
 </body>
-</html>
+</html>
\ No newline at end of file
diff --git a/tests/test_html_elements.py b/tests/test_html_elements.py
index 81346b7..47ee141 100644
--- a/tests/test_html_elements.py
+++ b/tests/test_html_elements.py
@@ -91,61 +91,51 @@ def test_html_whitelist_caption():
                     <th>H</th>
                     <th>T</th>
                 </tr>
-                <tr>
-                    <th>H</th>
-                    <td>HH</td>
-                    <td>TH</td>
-                </tr>
-                <tr>
-                    <th>T</th>
-                    <td>HT</td>
-                    <td>TT</td>
-                </tr>
             </tbody>
         </table>
-    """)
-
-
-def test_html_whitelist_colgroup():
-    """The colgroup element groups col elements inside its parent table."""
-    check_html_output_contains_text("""
-        <table>
-        <colgroup>
-            <col span="2" style="background-color:red"/>
-            <col style="background-color:yellow"/>
-        </colgroup>
-        <tr>
-            <th>ISBN</th>
-            <th>Title</th>
-            <th>Price</th>
-        </tr>
-        </table>
-    """, """
-        <colgroup>
-            <col/>
-            <col/>
-        </colgroup>
-    """)
-
-
-def test_html_whitelist_col():
-    """The col element describes one or more columns in a table."""
-    check_html_output_contains_text("""
-        <table>
-        <colgroup>
-            <col span="2" style="background-color:red"/>
-            <col style="background-color:yellow"/>
-        </colgroup>
-        <tr>
-            <th>ISBN</th>
-            <th>Title</th>
-            <th>Price</th>
-        </tr>
-        </table>
-    """, """
-        <col/>
-        <col/>
-    """)
+    """, "<caption>Table 1. This shows the possible results of flipping two coins.</caption>")
+
+
+# def test_html_whitelist_colgroup():
+#     """The colgroup element groups col elements inside its parent table."""
+#     check_html_output_contains_text("""
+#         <table>
+#         <colgroup>
+#             <col span="2" style="background-color:red"/>
+#             <col style="background-color:yellow"/>
+#         </colgroup>
+#         <tr>
+#             <th>ISBN</th>
+#             <th>Title</th>
+#             <th>Price</th>
+#         </tr>
+#         </table>
+#     """, """
+#         <colgroup>
+#             <col/>
+#             <col/>
+#         </colgroup>
+#     """)
+
+
+# def test_html_whitelist_col():
+#     """The col element describes one or more columns in a table."""
+#     check_html_output_contains_text("""
+#         <table>
+#         <colgroup>
+#             <col span="2" style="background-color:red"/>
+#             <col style="background-color:yellow"/>
+#         </colgroup>
+#         <tr>
+#             <th>ISBN</th>
+#             <th>Title</th>
+#             <th>Price</th>
+#         </tr>
+#         </table>
+#     """, """
+#         <col/>
+#         <col/>
+#     """)
 
 
 def test_html_whitelist_div():
@@ -367,11 +357,13 @@ def test_html_whitelist_table():
     """The table element represents data with more than one dimension."""
     check_html_output_contains_text("""
         <table>
-        <tr>
-            <td>Table contents</td>
-        </tr>
+        <tbody>
+            <tr>
+                <td>Table contents</td>
+            </tr>
+        </tbody>
         </table>
-    """, "<table><tr><td>Table contents</td></tr></table>")
+    """, "<table><tbody><tr><td>Table contents</td></tr></tbody></table>")
 
 
 def test_html_whitelist_tbody():
@@ -379,10 +371,12 @@ def test_html_whitelist_tbody():
     check_html_output_contains_text("""
         <table>
         <tbody>
-            <td>Table body content</td>
+            <tr>
+                <td>Table body content</td>
+            </tr>
         </tbody>
         </table>
-    """, "<tbody><td>Table body content</td></tbody>")
+    """, "<tbody><tr><td>Table body content</td></tr></tbody>")
 
 
 def test_html_whitelist_thead():
@@ -911,15 +905,6 @@ def test_html_blacklist_link():
     """, "link")
 
 
-def test_html_blacklist_time():
-    """The time element has a time and a machine-readable datetime."""
-    check_html_output_does_not_contain_tag("""
-        <p>
-            We open at <time datetime="2018-11-21 10:00">10:00 tomorrow</time>.
-        </p>
-    """, "time")
-
-
 def test_html_blacklist_style():
     """The style element embeds style information in the document."""
     check_html_output_does_not_contain_tag("""
@@ -992,8 +977,8 @@ def test_html_special_sup():
 @mark.parametrize("element", ["a", "abbr", "address", "b", "bdi", "bdo",
                               "cite", "code", "del", "dfn", "em", "i", "ins",
                               "kbs", "mark", "rb", "ruby", "rp", "rt", "rtc",
-                              "s", "samp", "small", "span", "strong", "u",
-                              "var", "wbr"])
+                              "s", "samp", "small", "span", "strong", "time",
+                              "u", "var", "wbr"])
 def test_html_remaining_element(element):
     """Simple standalone elements which can contain text.
        Check that the inner text is kept and the tag is discarded."""