Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check for disk space errors #121

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions icrawler/builtin/urllist.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ def worker_exec(self, queue_timeout=2, **kwargs):
if self.signal.get("reach_max_num"):
self.logger.info("downloaded image reached max num, thread %s" " exit", threading.current_thread().name)
break
if self.signal.get("exceed_storage_space"):
self.logger.info("downloaded image reached max storage space, thread %s" " exit", threading.current_thread().name)
break
try:
url = self.in_queue.get(timeout=queue_timeout)
except queue.Empty:
Expand Down
6 changes: 3 additions & 3 deletions icrawler/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,11 @@ def set_logger(self, log_level=logging.INFO):
def init_signal(self):
"""Init signal

3 signals are added: ``feeder_exited``, ``parser_exited`` and
``reach_max_num``.
4 signals are added: ``feeder_exited``, ``parser_exited``,
``reach_max_num`` and ``exceed_storage_space``.
"""
self.signal = Signal()
self.signal.set(feeder_exited=False, parser_exited=False, reach_max_num=False)
self.signal.set(feeder_exited=False, parser_exited=False, reach_max_num=False, exceed_storage_space=False)

def set_storage(self, storage):
"""Set storage backend for downloader
Expand Down
20 changes: 15 additions & 5 deletions icrawler/downloader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import queue
import errno
from io import BytesIO
from threading import current_thread
from urllib.parse import urlparse
Expand Down Expand Up @@ -113,7 +114,7 @@ def download(self, task, default_ext, timeout=5, max_retry=3, overwrite=False, *
return
self.fetched_num -= 1

while retry > 0 and not self.signal.get("reach_max_num"):
while retry > 0 and not self.signal.get("reach_max_num") and not self.signal.get("exceed_storage_space"):
try:
response = self.session.get(file_url, timeout=timeout)
except Exception as e:
Expand All @@ -135,10 +136,19 @@ def download(self, task, default_ext, timeout=5, max_retry=3, overwrite=False, *
with self.lock:
self.fetched_num += 1
filename = self.get_filename(task, default_ext)
self.logger.info("image #%s\t%s", self.fetched_num, file_url)
self.storage.write(filename, response.content)
task["success"] = True
task["filename"] = filename
self.logger.info("image #%s\t%s %s", self.fetched_num, filename, file_url)

task["success"] = False
try:
task["filename"] = filename # may be zero bytes if OSError happened during write()
self.storage.write(filename, response.content)
task["success"] = True
except OSError as o:
# only ENOSPC (no space left on device) is handled here; other errors, e.g. errno.EINVAL (name too long), are re-raised
if o.errno == errno.ENOSPC:
self.signal.set(exceed_storage_space=True)
else:
raise
break
finally:
retry -= 1
Expand Down
17 changes: 15 additions & 2 deletions icrawler/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ def worker_exec(self, queue_timeout=2, req_timeout=5, max_retry=3, **kwargs):
"downloaded image reached max num, thread %s " "is ready to exit", current_thread().name
)
break
if self.signal.get("exceed_storage_space"):
self.logger.info(
"no more storage space, thread %s " "is ready to exit", current_thread().name
)
break
# get the page url
try:
url = self.in_queue.get(timeout=queue_timeout)
Expand Down Expand Up @@ -91,8 +96,14 @@ def worker_exec(self, queue_timeout=2, req_timeout=5, max_retry=3, **kwargs):
)
else:
self.logger.info(f"parsing result page {url}")
for task in self.parse(response, **kwargs):
while not self.signal.get("reach_max_num"):
task_list = self.parse(response, **kwargs)
if not task_list:
self.logger.debug("self.parse() returned no tasks")
with open("task_list_error.log", 'ab') as f:
f.write(response.content)

for task in task_list:
while not self.signal.get("reach_max_num") and not self.signal.get("exceed_storage_space"):
try:
if isinstance(task, dict):
self.output(task, timeout=1)
Expand All @@ -111,6 +122,8 @@ def worker_exec(self, queue_timeout=2, req_timeout=5, max_retry=3, **kwargs):
break
if self.signal.get("reach_max_num"):
break
if self.signal.get("exceed_storage_space"):
break
self.in_queue.task_done()
break
finally:
Expand Down