@@ -1,6 +1,7 @@
 """Define the module containing the function used to scrap data from the APD website."""
 import asyncio
 import datetime
+from pathlib import Path
 import re
 from urllib.parse import urljoin

@@ -10,8 +11,9 @@
 from tenacity import stop_after_attempt
 from tenacity import wait_exponential

-from scrapd.core import date_utils
 from scrapd.core import article
+from scrapd.core import constant
+from scrapd.core import date_utils
 from scrapd.core import model
 from scrapd.core import twitter
 from scrapd.core.regex import match_pattern
@@ -125,7 +127,7 @@ def has_next(news_page):
     return bool(element)


-def parse_page(page, url):
+def parse_page(page, url, dump=False):
     """
     Parse the page using all parsing methods available.

@@ -148,11 +150,19 @@ def parse_page(page, url):
     article_err_str = f'\nArticle fields:\n\t * ' + "\n\t * ".join(artricle_err) if artricle_err else ''
     logger.debug(f'Errors while parsing {url}:{twitter_err_str}{article_err_str}')

+    # Dump the file.
+    if dump:
+        dumpr_dir = Path(constant.DUMP_DIR)
+        dumpr_dir.mkdir(parents=True, exist_ok=True)
+        dump_file_name = url.split('/')[-1]
+        dump_file = dumpr_dir / dump_file_name
+        dump_file.write_text(page)
+
     return report


 @retry()
-async def fetch_and_parse(session, url):
+async def fetch_and_parse(session, url, dump=False):
     """
     Parse a fatality page from a URL.

@@ -167,7 +177,7 @@ async def fetch_and_parse(session, url):
         raise ValueError(f'The URL {url} returned a 0-length content.')

     # Parse it.
-    report = parse_page(page, url)
+    report = parse_page(page, url, dump)
     if not report:
         raise ValueError(f'No data could be extracted from the page {url}.')

@@ -177,13 +187,16 @@ async def fetch_and_parse(session, url):
     return report


-async def async_retrieve(pages=-1, from_=None, to=None, attempts=1, backoff=1):
+async def async_retrieve(pages=-1, from_=None, to=None, attempts=1, backoff=1, dump=False):
     """
     Retrieve fatality data.

     :param str pages: number of pages to retrieve or -1 for all
     :param str from_: the start date
     :param str to: the end date
+    :param int attempts: number of attempts per report
+    :param int backoff: initial backoff time (second)
+    :param bool dump: dump reports with parsing issues
     :return: the list of fatalities and the number of pages that were read.
     :rtype: tuple
     """
@@ -218,7 +231,7 @@ async def async_retrieve(pages=-1, from_=None, to=None, attempts=1, backoff=1):
                     stop=stop_after_attempt(attempts),
                     wait=wait_exponential(multiplier=backoff),
                     reraise=True,
-                )(session, link) for link in links
+                )(session, link, dump) for link in links
             ]
             page_res = await asyncio.gather(*tasks)

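
A minimal usage sketch for the new flags, assuming this module is importable as scrapd.core.apd (an assumed path) and that constant.DUMP_DIR points at a writable location where fetched report pages are written when dump is enabled:

import asyncio

from scrapd.core import apd  # assumed import path for the module shown in this diff


async def main():
    # Retrieve the two most recent news pages, retrying each report up to
    # three times with exponential backoff, and dump the fetched report
    # pages under constant.DUMP_DIR for later inspection.
    reports, pages_read = await apd.async_retrieve(pages=2, attempts=3, backoff=2, dump=True)
    print(f'Read {pages_read} page(s) and extracted {len(reports)} report(s).')


asyncio.run(main())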