
Commit 157beca

rgreinho authored and mergify[bot] committed
Add --dump option (scrapd#214)
Adds a `--dump` option to help developers debug parsing issues. The documentation was updated accordingly and the pytest settings were adjusted to handle the new marker.
1 parent 4b4e38f commit 157beca

10 files changed (+108 −13 lines)

.github/CONTRIBUTING.rst

+45
@@ -123,6 +123,51 @@ with sudo and will prompt you for your password::
 
     inv flame-graph
 
+.. _contributing-dumping:
+
+Dumping
+-------
+
+`scrapd` comes with a `--dump` option, which saves the HTML content of the reports being parsed if they contain at
+least one parsing error, either in the Twitter fields or in the article itself. The dumped files are stored in a `.dump` directory.
+
+Workflow
+^^^^^^^^
+
+Start by running `scrapd` at the root of this project::
+
+   scrapd -vvv --dump 1>.dump/dump.json 2>.dump/dump.json.log
+
+In addition to the dumps, this will also create 2 files to help you debug:
+
+* a `dump.json` containing the parsed reports in JSON (useful to double-check the values)
+* a `dump.json.log` containing the parsing errors and the names of the files triggering them
+
+.. warning::
+
+   You may encounter false positives. For instance, some reports do not contain Twitter fields, which will obviously
+   trigger an error but is not something we can act on.
+
+Locate the test named `test_dumped_page` in the `tests/core/test_apd.py` file and update the test parameters with the
+name of the file you want to debug:
+
+.. code-block:: python
+
+   @pytest.mark.parametrize('page_dump', [
+       pytest.param('traffic-fatality-1-2', id='dumped'),
+   ])
+
+.. note::
+
+   You can specify as many files as you want by adding more `pytest.param` objects. This can be useful if you notice
+   the same parsing error being reported in various files.
+
+And finally, run pytest with the `dump` marker::
+
+   pytest -s -vvv -n0 -x -m dump
+
 
 .. _`Draft Pull Request`: https://github.blog/2019-02-14-introducing-draft-pull-requests/
 .. _`How to Write a Git Commit Message`: http://chris.beams.io/posts/git-commit
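
For quick offline inspection of a single dumped report outside of pytest, a small script along these lines can help. It is only a sketch: it reuses the same parsers the new test calls, assumes `scrapd --dump` has already produced the `.dump` directory, and the file name is just an example.

    # Re-parse one dumped report and print its parsing errors (hypothetical helper script).
    from pathlib import Path

    from scrapd.core import article
    from scrapd.core import twitter

    page = Path('.dump/traffic-fatality-1-2').read_text()  # example dumped file
    _, twitter_errors = twitter.parse(page)
    _, article_errors = article.parse_content(page)
    print(twitter_errors or article_errors or 'no parsing errors')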

.gitignore

+1
@@ -13,6 +13,7 @@
 .cache
 .coverage
 .coverage.*
+.dump/
 .eggs/
 .idea/
 .installed.cfg

docs/source/usage.rst

+3
@@ -53,6 +53,9 @@ The log level can be adjusted by adding/removing `-v` flags:
 
 For 2 `-v` and more, the log format also changes from compact to verbose.
 
+The `--dump` option is intended for developers only. If the parser encounters an error, it will dump the
+content of the HTML page to disk, in a `.dump` directory. See :ref:`contributing-dumping` for more information.
+
 docker
 ------
 
noxfile.py

+1 −1

@@ -161,7 +161,7 @@ def run_pytest(session, *posargs):
 
 
 def run_pytest_units(session):
-    run_pytest(session, '-m', 'not integrations')
+    run_pytest(session, '-m', 'not integrations and not dump')
 
 
 def run_pytest_integrations(session):

scrapd/cli/cli.py

+3 −1

@@ -24,6 +24,7 @@
 @click.command()
 @click.option('-a', '--attempts', type=click.INT, default=3, help='number of attempts per report', show_default=True)
 @click.option('-b', '--backoff', type=click.INT, default=3, help='initial backoff time (second)', show_default=True)
+@click.option('--dump', is_flag=True, help='dump reports with parsing issues', show_default=True)
 @click.option(
     '-f',
     '--format',
@@ -38,7 +39,7 @@
 @click.option('--to', help='end date')
 @click.option('-v', '--verbose', count=True, help='adjust the log level')
 @click.pass_context
-def cli(ctx, attempts, backoff, format_, from_, pages, to, verbose):  # noqa: D403
+def cli(ctx, attempts, backoff, dump, format_, from_, pages, to, verbose):  # noqa: D403
     """Retrieve APD's traffic fatality reports."""
     ctx.obj = {**ctx.params}
     ctx.auto_envvar_prefix = 'VZ'
@@ -82,6 +83,7 @@ def _execute(self):
             self.args['to'],
             self.args['attempts'],
             self.args['backoff'],
+            self.args['dump'],
         ))
         result_count = len(results)
         logger.info(f'Total: {result_count}')
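
For reference, the way a boolean `is_flag` option surfaces in Click can be sketched with a self-contained toy command; the command name below is illustrative, not scrapd's actual wiring.

    # Toy Click command demonstrating an is_flag option like --dump.
    import click

    @click.command()
    @click.option('--dump', is_flag=True, help='dump reports with parsing issues', show_default=True)
    def demo(dump):
        # dump is False unless --dump is passed on the command line.
        click.echo(f'dump={dump}')

    if __name__ == '__main__':
        demo()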

scrapd/core/apd.py

+19 −6

@@ -1,6 +1,7 @@
 """Define the module containing the function used to scrap data from the APD website."""
 import asyncio
 import datetime
+from pathlib import Path
 import re
 from urllib.parse import urljoin
 
@@ -10,8 +11,9 @@
 from tenacity import stop_after_attempt
 from tenacity import wait_exponential
 
-from scrapd.core import date_utils
 from scrapd.core import article
+from scrapd.core import constant
+from scrapd.core import date_utils
 from scrapd.core import model
 from scrapd.core import twitter
 from scrapd.core.regex import match_pattern
@@ -125,7 +127,7 @@ def has_next(news_page):
     return bool(element)
 
 
-def parse_page(page, url):
+def parse_page(page, url, dump=False):
     """
     Parse the page using all parsing methods available.
 
@@ -148,11 +150,19 @@ def parse_page(page, url):
     article_err_str = f'\nArticle fields:\n\t * ' + "\n\t * ".join(artricle_err) if artricle_err else ''
     logger.debug(f'Errors while parsing {url}:{twitter_err_str}{article_err_str}')
 
+    # Dump the file.
+    if dump:
+        dump_dir = Path(constant.DUMP_DIR)
+        dump_dir.mkdir(parents=True, exist_ok=True)
+        dump_file_name = url.split('/')[-1]
+        dump_file = dump_dir / dump_file_name
+        dump_file.write_text(page)
+
     return report
 
 
 @retry()
-async def fetch_and_parse(session, url):
+async def fetch_and_parse(session, url, dump=False):
     """
     Parse a fatality page from a URL.
 
@@ -167,7 +177,7 @@ async def fetch_and_parse(session, url):
         raise ValueError(f'The URL {url} returned a 0-length content.')
 
     # Parse it.
-    report = parse_page(page, url)
+    report = parse_page(page, url, dump)
     if not report:
         raise ValueError(f'No data could be extracted from the page {url}.')
 
@@ -177,13 +187,16 @@ async def fetch_and_parse(session, url):
     return report
 
 
-async def async_retrieve(pages=-1, from_=None, to=None, attempts=1, backoff=1):
+async def async_retrieve(pages=-1, from_=None, to=None, attempts=1, backoff=1, dump=False):
     """
     Retrieve fatality data.
 
     :param str pages: number of pages to retrieve or -1 for all
     :param str from_: the start date
     :param str to: the end date
+    :param int attempts: number of attempts per report
+    :param int backoff: initial backoff time (second)
+    :param bool dump: dump reports with parsing issues
     :return: the list of fatalities and the number of pages that were read.
     :rtype: tuple
     """
@@ -218,7 +231,7 @@ async def async_retrieve(pages=-1, from_=None, to=None, attempts=1, backoff=1):
                 stop=stop_after_attempt(attempts),
                 wait=wait_exponential(multiplier=backoff),
                 reraise=True,
-            )(session, link) for link in links
+            )(session, link, dump) for link in links
         ]
         page_res = await asyncio.gather(*tasks)
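
The dumping step added to `parse_page()` can also be read in isolation. The sketch below mirrors the diff: the directory name comes from `constant.DUMP_DIR`, but the helper function name is invented for illustration and is not part of the commit.

    # Standalone sketch of the dump-to-disk step (hypothetical helper name).
    from pathlib import Path

    DUMP_DIR = '.dump'

    def dump_page(page: str, url: str) -> Path:
        """Write the HTML content of a report into the .dump directory."""
        dump_dir = Path(DUMP_DIR)
        dump_dir.mkdir(parents=True, exist_ok=True)
        dump_file = dump_dir / url.split('/')[-1]  # file named after the last URL segment
        dump_file.write_text(page)
        return dump_file

    # dump_page('<html></html>', 'http://austintexas.gov/news/traffic-fatality-1-2')
    # creates .dump/traffic-fatality-1-2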

scrapd/core/constant.py

+3
@@ -23,3 +23,6 @@ class Fields():
     MIDDLE_NAME = 'middle'
     NOTES = 'notes'
     TIME = 'time'
+
+
+DUMP_DIR = '.dump'

setup.cfg

+2
@@ -65,3 +65,5 @@ ignore = D106,D202,D203,D212,D213
 
 [tool:pytest]
 addopts = --disable-pytest-warnings --disable-warnings -n auto
+markers =
+    dump: uses dumped files to spot parsing issues
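
Registering the marker in `setup.cfg` lets pytest filter on it without warnings. A minimal example of a test carrying the marker (hypothetical test, not part of this commit):

    import pytest

    @pytest.mark.dump
    def test_needs_dumped_files():
        # Selected by `pytest -m dump`, excluded by `-m "not dump"` (as noxfile.py now does).
        assert True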

tests/core/test_apd.py

+22 −5

@@ -11,6 +11,9 @@
 from tenacity import stop_after_attempt
 
 from scrapd.core import apd
+from scrapd.core import article
+from scrapd.core import twitter
+from tests.test_common import load_dumped_page
 from tests.test_common import load_test_page
 from tests.test_common import TEST_DATA_DIR
 
@@ -218,8 +221,22 @@ async def test_fetch_and_parse_01(page, mocker):
         await apd.fetch_and_parse(None, 'url')
 
 
-# This is an invalid deceased field due to the "born" keyword:
-# "Deceased: Felipe Ramirez, Hispanic male, born 9-16-93"
-def test_parse_page_00():
-    """."""
-    pass
+@pytest.mark.parametrize('page_dump', [
+    pytest.param('traffic-fatality-1-2', id='dumped'),
+])
+@pytest.mark.dump
+def test_dumped_page(page_dump):
+    """
+    Helper test to allow debugging offline.
+
+    Run the following command: `pytest -s -n0 -x -vvv -m dump`
+    """
+    try:
+        page = load_dumped_page(page_dump)
+    except FileNotFoundError:
+        raise FileNotFoundError(f'Dump file "{page_dump}" not found: run "scrapd --dump" first.')
+    else:
+        twitter_report, twitter_err = twitter.parse(page)
+        assert not twitter_err
+        article_report, article_err = article.parse_content(page)
+        assert not article_err

tests/test_common.py

+9
@@ -1,8 +1,11 @@
 """Define the common values and functions to run the tests."""
 from pathlib import Path
 
+from scrapd.core import constant
+
 TEST_ROOT_DIR = Path(__file__).resolve().parent
 TEST_DATA_DIR = TEST_ROOT_DIR / 'data'
+TEST_DUMP_DIR = TEST_ROOT_DIR.parent / constant.DUMP_DIR
 
 
 def load_test_page(page):
@@ -11,6 +14,12 @@ def load_test_page(page):
     return page_fd.read_text()
 
 
+def load_dumped_page(page):
+    """Load a dumped page."""
+    page_fd = TEST_DUMP_DIR / page
+    return page_fd.read_text()
+
+
 def scenario_inputs(scenarios):
     """Parse the scenarios and feed the data to the test function."""
     return [test_input[0] for test_input in scenarios]
