Skip to content

Commit efff647

Browse files
Jerome-Hsieh, pre-commit-ci[bot], KumoLiu
authored
enhance download_and_extract (#8216)
Fixes #5463 ### Description According to the issue, the error messages are not very intuitive. I think we can check whether the file name matches the downloaded file’s base name before starting the download. If it doesn’t match, the user is notified. ### Types of changes <!--- Put an `x` in all the boxes that apply, and remove the not applicable items --> - [x] Non-breaking change (fix or new feature that would not break existing functionality). - [ ] Breaking change (fix or new feature that would cause existing functionality to change). - [ ] New tests added to cover the changes. - [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`. - [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`. - [ ] In-line docstrings updated. - [ ] Documentation updated, tested `make html` command in the `docs/` folder. --------- Signed-off-by: jerome_Hsieh <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: YunLiu <[email protected]>
1 parent e1e3d8e commit efff647

File tree

2 files changed

+40
-2
lines changed

2 files changed

+40
-2
lines changed

monai/apps/utils.py

+38-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import json
1616
import logging
1717
import os
18+
import re
1819
import shutil
1920
import sys
2021
import tarfile
@@ -30,7 +31,9 @@
3031
from monai.config.type_definitions import PathLike
3132
from monai.utils import look_up_option, min_version, optional_import
3233

34+
requests, has_requests = optional_import("requests")
3335
gdown, has_gdown = optional_import("gdown", "4.7.3")
36+
BeautifulSoup, has_bs4 = optional_import("bs4", name="BeautifulSoup")
3437

3538
if TYPE_CHECKING:
3639
from tqdm import tqdm
@@ -298,6 +301,29 @@ def extractall(
298301
)
299302

300303

304+
def get_filename_from_url(data_url: str, timeout: float = 30.0) -> str:
    """
    Get the filename from the URL link.

    The name is resolved in the following order:

    1. the ``filename`` entry of the ``Content-Disposition`` header returned by a
       ``HEAD`` request to ``data_url`` (redirects are followed);
    2. if ``data_url`` contains ``drive.google.com`` and a ``GET`` request returns an
       HTML page, the text of the link inside the ``uc-name-size`` span of that page;
    3. otherwise, the result of ``_basename(data_url)``.

    Args:
        data_url: URL of the file whose name should be determined.
        timeout: number of seconds to wait for the server on each request before
            giving up, so that an unresponsive host cannot block the caller forever.

    Returns:
        The file name as a string.

    Raises:
        Exception: if any step fails (network error, timeout, parsing error); the
            original exception is chained as the cause.
    """
    try:
        response = requests.head(data_url, allow_redirects=True, timeout=timeout)
        content_disposition = response.headers.get("Content-Disposition")
        if content_disposition:
            filename = re.findall(r'filename="?([^";]+)"?', content_disposition)
            if filename:
                return str(filename[0])
        if "drive.google.com" in data_url:
            response = requests.get(data_url, timeout=timeout)
            if "text/html" in response.headers.get("Content-Type", ""):
                soup = BeautifulSoup(response.text, "html.parser")
                filename_span = soup.find("span", {"class": "uc-name-size"})
                # the span may exist without a link inside it; fall back to the
                # URL base name in that case instead of failing on `None.text`
                filename_link = filename_span.find("a") if filename_span else None
                if filename_link is not None:
                    return str(filename_link.text)
        return _basename(data_url)
    except Exception as e:
        raise Exception(f"Error processing URL: {e}") from e
325+
326+
301327
def download_and_extract(
302328
url: str,
303329
filepath: PathLike = "",
@@ -327,7 +353,18 @@ def download_and_extract(
327353
be False.
328354
progress: whether to display progress bar.
329355
"""
356+
url_filename_ext = "".join(Path(get_filename_from_url(url)).suffixes)
357+
filepath_ext = "".join(Path(_basename(filepath)).suffixes)
358+
if filepath not in ["", "."]:
359+
if filepath_ext == "":
360+
new_filepath = Path(filepath).with_suffix(url_filename_ext)
361+
logger.warning(
362+
f"filepath={filepath}, which missing file extension. Auto-appending extension to: {new_filepath}"
363+
)
364+
filepath = new_filepath
365+
if filepath_ext and filepath_ext != url_filename_ext:
366+
raise ValueError(f"File extension mismatch: expected extension {url_filename_ext}, but get {filepath_ext}")
330367
with tempfile.TemporaryDirectory() as tmp_dir:
331-
filename = filepath or Path(tmp_dir, _basename(url)).resolve()
368+
filename = filepath or Path(tmp_dir, get_filename_from_url(url)).resolve()
332369
download_url(url=url, filepath=filename, hash_val=hash_val, hash_type=hash_type, progress=progress)
333370
extractall(filepath=filename, output_dir=output_dir, file_type=file_type, has_base=has_base)

tests/test_download_and_extract.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,10 @@
2020
from parameterized import parameterized
2121

2222
from monai.apps import download_and_extract, download_url, extractall
23-
from tests.utils import skip_if_downloading_fails, skip_if_quick, testing_data_config
23+
from tests.utils import SkipIfNoModule, skip_if_downloading_fails, skip_if_quick, testing_data_config
2424

2525

26+
@SkipIfNoModule("requests")
2627
class TestDownloadAndExtract(unittest.TestCase):
2728

2829
@skip_if_quick

0 commit comments

Comments
 (0)