Skip to content

Commit

Permalink
lockfile in download_decompress (#1332)
Browse files Browse the repository at this point in the history
* lockfile in download_decompress

* Revert "lockfile in download_decompress"

This reverts commit 6e67de8.

* refactor: download synchronization with filelock

* filelock logs and poll interval

* remove lockfile.unlink() because of a race condition

* feat: filelock log level increased

Co-authored-by: Fedor Ignatov <[email protected]>
  • Loading branch information
shuu01 and IgnatovFedor authored Nov 10, 2020
1 parent 3b78bab commit b552d0c
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 13 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,8 @@ download/

# project data
/data/

# local dockerfiles
/Dockerfile
/entrypoint.sh
/.dockerignore
1 change: 0 additions & 1 deletion deeppavlov/core/data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,6 @@ def simple_download(url: str, destination: Union[Path, str]) -> None:
return s3_download(url, str(destination))

chunk_size = 32 * 1024

temporary = destination.with_suffix(destination.suffix + '.part')

headers = {'dp-token': _get_download_token()}
Expand Down
28 changes: 16 additions & 12 deletions deeppavlov/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
from logging import getLogger
from pathlib import Path
from typing import Union, Optional, Dict, Iterable, Set, Tuple, List
from urllib.parse import urlparse

import requests
from filelock import FileLock

import deeppavlov
from deeppavlov.core.commands.utils import expand_path, parse_config
Expand Down Expand Up @@ -124,19 +126,21 @@ def check_md5(url: str, dest_paths: List[Path]) -> bool:
return True


def download_resource(url: str, dest_paths: Iterable[Union[Path, str]]) \
-> None:
def download_resource(url: str, dest_paths: Iterable[Union[Path, str]]) -> None:
    """Download a resource from ``url`` into every directory in ``dest_paths``.

    Concurrent processes requesting the same resource are serialised with a
    file lock placed next to the download target: one process performs the
    download while the others block on the lock, then skip the download once
    ``check_md5`` reports matching hashes.

    Args:
        url: URL of the resource. Archive URLs (containing ``.tar.gz``,
            ``.gz`` or ``.zip``) are downloaded and decompressed.
        dest_paths: destination directories to place the file(s) into.
    """
    dest_paths = [Path(dest) for dest in dest_paths]
    download_path = dest_paths[0].parent
    # Ensure the parent exists before creating the lockfile inside it.
    download_path.mkdir(parents=True, exist_ok=True)
    # Take the name from the URL *path* component so query strings
    # (``?token=...``) never leak into the file name.
    file_name = urlparse(url).path.split('/')[-1]
    lockfile = download_path / f'.{file_name}.lock'

    # NOTE: ``poll_intervall`` (sic) is the actual parameter name in
    # filelock 3.0.12, which is pinned in requirements.txt.
    # The lockfile is deliberately never unlinked: deleting it would
    # reintroduce a race between a waiter and a fresh acquirer.
    with FileLock(lockfile).acquire(poll_intervall=10):
        if check_md5(url, dest_paths):
            log.info(f'Skipped {url} download because of matching hashes')
        elif any(ext in url for ext in ('.tar.gz', '.gz', '.zip')):
            download_decompress(url, download_path, dest_paths)
        else:
            dest_files = [dest_path / file_name for dest_path in dest_paths]
            download(dest_files, url)


def download_resources(args: Namespace) -> None:
Expand Down
7 changes: 7 additions & 0 deletions deeppavlov/utils/settings/log_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@
"uvicorn_handler"
],
"propagate": true
},
"filelock": {
"level": "WARNING",
"handlers": [
"stdout"
],
"propagate": true
}
},
"formatters": {
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
aio-pika==6.4.1
Cython==0.29.14
fastapi==0.47.1
filelock==3.0.12
h5py==2.10.0
nltk==3.4.5
numpy==1.18.0
Expand Down

0 comments on commit b552d0c

Please sign in to comment.