
Commit 157beca

rgreinho authored and mergify[bot] committed
Add --dump option (scrapd#214)
Adds a `--dump` option to help developers debug parsing issues. The documentation was updated accordingly and the pytest settings were adjusted to handle the new marker.
1 parent 4b4e38f commit 157beca

10 files changed (+108 −13 lines)

.github/CONTRIBUTING.rst

+45
@@ -123,6 +123,51 @@ with sudo and will prompt you for your password::
 
     inv flame-graph
 
+.. _contributing-dumping:
+
+Dumping
+-------
+
+`scrapd` comes with a `--dump` option, which saves the HTML content of the reports being parsed if they contain at
+least one parsing error, either in the Twitter fields or in the article itself. The dumped files are stored in a `.dump` directory.
+
+Workflow
+^^^^^^^^
+
+Start by running `scrapd` at the root of this project::
+
+   scrapd -vvv --dump 1>.dump/dump.json 2>.dump/dump.json.log
+
+In addition to the dumps, this will also create 2 files to help you debug:
+
+* a `dump.json` containing the parsed reports in JSON (useful to double-check the values)
+* a `dump.json.log` containing the parsing errors and the names of the files triggering them
+
+.. warning::
+
+   You may encounter false positives. For instance, some reports do not contain Twitter fields, which will obviously
+   trigger an error but is not something we can act on.
+
+Locate the test named `test_dumped_page` in the `tests/core/test_apd.py` file and update the test parameters with the
+name of the file you want to debug:
+
+.. code-block:: python
+
+   @pytest.mark.parametrize('page_dump', [
+       pytest.param('traffic-fatality-1-2', id='dumped'),
+   ])
+
+.. note::
+
+   You can specify as many files as you want by adding more `pytest.param` objects. This can be useful if you notice
+   the same parsing error being reported in various files.
+
+And finally, run pytest with the `dump` marker::
+
+   pytest -s -vvv -n0 -x -m dump
+
 
 .. _`Draft Pull Request`: https://github.blog/2019-02-14-introducing-draft-pull-requests/
 .. _`How to Write a Git Commit Message`: http://chris.beams.io/posts/git-commit
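
For quick offline inspection of a single dumped report outside of pytest, a small script along these lines can help. It is only a sketch: it reuses the same parsers the new test calls, assumes `scrapd --dump` has already produced the `.dump` directory, and the file name is just an example.

    # Re-parse one dumped report and print its parsing errors (hypothetical helper script).
    from pathlib import Path

    from scrapd.core import article
    from scrapd.core import twitter

    page = Path('.dump/traffic-fatality-1-2').read_text()  # example dumped file
    _, twitter_errors = twitter.parse(page)
    _, article_errors = article.parse_content(page)
    print(twitter_errors or article_errors or 'no parsing errors')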

.gitignore

+1
@@ -13,6 +13,7 @@
 .cache
 .coverage
 .coverage.*
+.dump/
 .eggs/
 .idea/
 .installed.cfg

docs/source/usage.rst

+3
@@ -53,6 +53,9 @@ The log level can be adjusted by adding/removing `-v` flags:
 
 For 2 `-v` and more, the log format also changes from compact to verbose.
 
+The `--dump` option is intended for developers only. If the parser encounters an error, it will dump the
+content of the HTML page to disk, in a `.dump` directory. See :ref:`contributing-dumping` for more information.
+
 docker
 ------
 
noxfile.py

+1 −1

@@ -161,7 +161,7 @@ def run_pytest(session, *posargs):
 
 
 def run_pytest_units(session):
-    run_pytest(session, '-m', 'not integrations')
+    run_pytest(session, '-m', 'not integrations and not dump')
 
 
 def run_pytest_integrations(session):

scrapd/cli/cli.py

+3 −1

@@ -24,6 +24,7 @@
 @click.command()
 @click.option('-a', '--attempts', type=click.INT, default=3, help='number of attempts per report', show_default=True)
 @click.option('-b', '--backoff', type=click.INT, default=3, help='initial backoff time (second)', show_default=True)
+@click.option('--dump', is_flag=True, help='dump reports with parsing issues', show_default=True)
 @click.option(
     '-f',
     '--format',
@@ -38,7 +39,7 @@
 @click.option('--to', help='end date')
 @click.option('-v', '--verbose', count=True, help='adjust the log level')
 @click.pass_context
-def cli(ctx, attempts, backoff, format_, from_, pages, to, verbose):  # noqa: D403
+def cli(ctx, attempts, backoff, dump, format_, from_, pages, to, verbose):  # noqa: D403
     """Retrieve APD's traffic fatality reports."""
     ctx.obj = {**ctx.params}
     ctx.auto_envvar_prefix = 'VZ'
@@ -82,6 +83,7 @@ def _execute(self):
             self.args['to'],
             self.args['attempts'],
             self.args['backoff'],
+            self.args['dump'],
         ))
         result_count = len(results)
         logger.info(f'Total: {result_count}')
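
For reference, the way a boolean `is_flag` option surfaces in Click can be sketched with a self-contained toy command; the command name below is illustrative, not scrapd's actual wiring.

    # Toy Click command demonstrating an is_flag option like --dump.
    import click

    @click.command()
    @click.option('--dump', is_flag=True, help='dump reports with parsing issues', show_default=True)
    def demo(dump):
        # dump is False unless --dump is passed on the command line.
        click.echo(f'dump={dump}')

    if __name__ == '__main__':
        demo()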

scrapd/core/apd.py

+19 −6

@@ -1,6 +1,7 @@
 """Define the module containing the function used to scrap data from the APD website."""
 import asyncio
 import datetime
+from pathlib import Path
 import re
 from urllib.parse import urljoin
 
@@ -10,8 +11,9 @@
 from tenacity import stop_after_attempt
 from tenacity import wait_exponential
 
-from scrapd.core import date_utils
 from scrapd.core import article
+from scrapd.core import constant
+from scrapd.core import date_utils
 from scrapd.core import model
 from scrapd.core import twitter
 from scrapd.core.regex import match_pattern
@@ -125,7 +127,7 @@ def has_next(news_page):
     return bool(element)
 
 
-def parse_page(page, url):
+def parse_page(page, url, dump=False):
     """
     Parse the page using all parsing methods available.
 
@@ -148,11 +150,19 @@ def parse_page(page, url):
     article_err_str = f'\nArticle fields:\n\t * ' + "\n\t * ".join(artricle_err) if artricle_err else ''
     logger.debug(f'Errors while parsing {url}:{twitter_err_str}{article_err_str}')
 
+    # Dump the file.
+    if dump:
+        dump_dir = Path(constant.DUMP_DIR)
+        dump_dir.mkdir(parents=True, exist_ok=True)
+        dump_file_name = url.split('/')[-1]
+        dump_file = dump_dir / dump_file_name
+        dump_file.write_text(page)
+
     return report
 
 
 @retry()
-async def fetch_and_parse(session, url):
+async def fetch_and_parse(session, url, dump=False):
     """
     Parse a fatality page from a URL.
 
@@ -167,7 +177,7 @@ async def fetch_and_parse(session, url):
         raise ValueError(f'The URL {url} returned a 0-length content.')
 
     # Parse it.
-    report = parse_page(page, url)
+    report = parse_page(page, url, dump)
     if not report:
         raise ValueError(f'No data could be extracted from the page {url}.')
 
@@ -177,13 +187,16 @@ async def fetch_and_parse(session, url):
     return report
 
 
-async def async_retrieve(pages=-1, from_=None, to=None, attempts=1, backoff=1):
+async def async_retrieve(pages=-1, from_=None, to=None, attempts=1, backoff=1, dump=False):
     """
     Retrieve fatality data.
 
     :param str pages: number of pages to retrieve or -1 for all
     :param str from_: the start date
     :param str to: the end date
+    :param int attempts: number of attempts per report
+    :param int backoff: initial backoff time (second)
+    :param bool dump: dump reports with parsing issues
     :return: the list of fatalities and the number of pages that were read.
     :rtype: tuple
     """
@@ -218,7 +231,7 @@ async def async_retrieve(pages=-1, from_=None, to=None, attempts=1, backoff=1):
                 stop=stop_after_attempt(attempts),
                 wait=wait_exponential(multiplier=backoff),
                 reraise=True,
-            )(session, link) for link in links
+            )(session, link, dump) for link in links
         ]
         page_res = await asyncio.gather(*tasks)
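
The dumping step added to `parse_page()` can also be read in isolation. The sketch below mirrors the diff: the directory name comes from `constant.DUMP_DIR`, but the helper function name is invented for illustration and is not part of the commit.

    # Standalone sketch of the dump-to-disk step (hypothetical helper name).
    from pathlib import Path

    DUMP_DIR = '.dump'

    def dump_page(page: str, url: str) -> Path:
        """Write the HTML content of a report into the .dump directory."""
        dump_dir = Path(DUMP_DIR)
        dump_dir.mkdir(parents=True, exist_ok=True)
        dump_file = dump_dir / url.split('/')[-1]  # file named after the last URL segment
        dump_file.write_text(page)
        return dump_file

    # dump_page('<html></html>', 'http://austintexas.gov/news/traffic-fatality-1-2')
    # creates .dump/traffic-fatality-1-2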

scrapd/core/constant.py

+3
@@ -23,3 +23,6 @@ class Fields():
     MIDDLE_NAME = 'middle'
     NOTES = 'notes'
     TIME = 'time'
+
+
+DUMP_DIR = '.dump'

setup.cfg

+2
@@ -65,3 +65,5 @@ ignore = D106,D202,D203,D212,D213
 
 [tool:pytest]
 addopts = --disable-pytest-warnings --disable-warnings -n auto
+markers =
+    dump: uses dumped files to spot parsing issues
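
Registering the marker in `setup.cfg` lets pytest filter on it without warnings. A minimal example of a test carrying the marker (hypothetical test, not part of this commit):

    import pytest

    @pytest.mark.dump
    def test_needs_dumped_files():
        # Selected by `pytest -m dump`, excluded by `-m "not dump"` (as noxfile.py now does).
        assert True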

tests/core/test_apd.py

+22 −5

@@ -11,6 +11,9 @@
 from tenacity import stop_after_attempt
 
 from scrapd.core import apd
+from scrapd.core import article
+from scrapd.core import twitter
+from tests.test_common import load_dumped_page
 from tests.test_common import load_test_page
 from tests.test_common import TEST_DATA_DIR
 
@@ -218,8 +221,22 @@ async def test_fetch_and_parse_01(page, mocker):
         await apd.fetch_and_parse(None, 'url')
 
 
-# This is an invalid deceased field due to the "born" keyword:
-# "Deceased: Felipe Ramirez, Hispanic male, born 9-16-93"
-def test_parse_page_00():
-    """."""
-    pass
+@pytest.mark.parametrize('page_dump', [
+    pytest.param('traffic-fatality-1-2', id='dumped'),
+])
+@pytest.mark.dump
+def test_dumped_page(page_dump):
+    """
+    Helper test to allow debugging offline.
+
+    Run the following command: `pytest -s -n0 -x -vvv -m dump`
+    """
+    try:
+        page = load_dumped_page(page_dump)
+    except FileNotFoundError:
+        raise FileNotFoundError(f'Dump file "{page_dump}" not found: run "scrapd --dump" first.')
+    else:
+        twitter_report, twitter_err = twitter.parse(page)
+        assert not twitter_err
+        article_report, article_err = article.parse_content(page)
+        assert not article_err

tests/test_common.py

+9
@@ -1,8 +1,11 @@
 """Define the common values and functions to run the tests."""
 from pathlib import Path
 
+from scrapd.core import constant
+
 TEST_ROOT_DIR = Path(__file__).resolve().parent
 TEST_DATA_DIR = TEST_ROOT_DIR / 'data'
+TEST_DUMP_DIR = TEST_ROOT_DIR.parent / constant.DUMP_DIR
 
 
 def load_test_page(page):
@@ -11,6 +14,12 @@ def load_test_page(page):
     return page_fd.read_text()
 
 
+def load_dumped_page(page):
+    """Load a dumped page."""
+    page_fd = TEST_DUMP_DIR / page
+    return page_fd.read_text()
+
+
 def scenario_inputs(scenarios):
     """Parse the scenarios and feed the data to the test function."""
     return [test_input[0] for test_input in scenarios]
