-
Notifications
You must be signed in to change notification settings - Fork 4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #265 from weingartlorenz/main
This is a component designed to download the Xview dataset
- Loading branch information
Showing
4 changed files
with
331 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
cwlVersion: v1.2 | ||
class: CommandLineTool | ||
|
||
baseCommand: "claimed" | ||
|
||
inputs: | ||
component: | ||
type: string | ||
default: docker.io/mdorzweiler/claimed-input-xview-download:0.1 | ||
inputBinding: | ||
position: 1 | ||
prefix: --component | ||
log_level: | ||
type: string | ||
default: "INFO" | ||
inputBinding: | ||
position: 2 | ||
prefix: --log_level | ||
username: | ||
type: string | ||
default: None | ||
inputBinding: | ||
position: 3 | ||
prefix: --username | ||
password: | ||
type: string | ||
default: None | ||
inputBinding: | ||
position: 4 | ||
prefix: --password | ||
move_to_dir: | ||
type: string | ||
default: None | ||
inputBinding: | ||
position: 5 | ||
prefix: --move_to_dir | ||
chromedriver_path: | ||
type: string | ||
default: None | ||
inputBinding: | ||
position: 6 | ||
prefix: --chromedriver_path | ||
max_download_time: | ||
type: string | ||
default: None | ||
inputBinding: | ||
position: 7 | ||
prefix: --max_download_time | ||
label: | ||
type: string | ||
default: None | ||
inputBinding: | ||
position: 8 | ||
prefix: --label | ||
|
||
|
||
outputs: [] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "147f9480", | ||
"metadata": {}, | ||
"source": [ | ||
"## Xview Dataset Download \n", | ||
"\n", | ||
"This component is designed to download a labeled overhead image dataset, provided a chromedriver, to a specified location. \n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "c185c1f0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"!pip install selenium" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "dc0554b5", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"import os\n", | ||
"import shutil\n", | ||
"import time\n", | ||
"from selenium import webdriver\n", | ||
"from selenium.webdriver.common.by import By\n", | ||
"from selenium.webdriver.support.ui import WebDriverWait\n", | ||
"from selenium.webdriver.support import expected_conditions as EC\n", | ||
"from urllib.parse import urlparse\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "866d16c3", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"# username for the Xview webpage to authorize login\n", | ||
"username = os.environ.get('username')\n", | ||
"\n", | ||
"# password for the Xview webpage to authorize login\n", | ||
"password = os.environ.get('password')\n", | ||
"\n", | ||
"# move_to_dir the directory where the dataset should be saved\n", | ||
"move_to_dir = os.environ.get('move_to_dir')\n", | ||
"\n", | ||
"# chromedriver_path the directory where the local copy of chromedriver is saved\n", | ||
"chromedriver_path = os.environ.get('chromedriver_path')\n", | ||
"\n", | ||
"# max_download_time before timeout, must be adjusted according to the file size and internet speed\n", | ||
"max_download_time = os.environ.get('max_download_time')\n", | ||
"\n", | ||
"# The label of the file desired to download.\n", | ||
"# Choose from \"TI.zip\", \"TL.zip\", \"VI.zip\", \"TI.tgz\", \"TL.tgz\", \"VI.tgz\", \n", | ||
"# standing for TI=Training Images, TL=Training Labels, VI=Validation Images\n", | ||
"label = os.environ.get('label')\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "794506c5", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"def login_and_download(username, password, move_to_dir, chromedriver_path, max_download_time, label): \n", | ||
" \n", | ||
" # Set Chrome options to automatically download files to the specified directory\n", | ||
" options = webdriver.ChromeOptions()\n", | ||
" prefs = {\n", | ||
" \"download.default_directory\": move_to_dir,\n", | ||
" \"download.prompt_for_download\": False,\n", | ||
" \"download.directory_upgrade\": True,\n", | ||
" \"safebrowsing.enabled\": True\n", | ||
" }\n", | ||
" options.add_experimental_option(\"prefs\", prefs)\n", | ||
"\n", | ||
" # Start a new instance of Chrome web browser\n", | ||
" driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)\n", | ||
" \n", | ||
" # Open the login page\n", | ||
" url_login = r'https://challenge.xviewdataset.org/login'\n", | ||
" driver.get(url_login)\n", | ||
"\n", | ||
" # Find the username and password fields and enter credentials\n", | ||
" username_field = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'email')))\n", | ||
" password_field = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'password')))\n", | ||
" username_field.send_keys(username)\n", | ||
" password_field.send_keys(password)\n", | ||
"\n", | ||
" # Find and click the login button\n", | ||
" login_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn.primary')))\n", | ||
" login_button.click()\n", | ||
" \n", | ||
" # Wait for the page to load after login\n", | ||
" time.sleep(1)\n", | ||
" \n", | ||
" # Open the Download page\n", | ||
" url_download = r'https://challenge.xviewdataset.org/download-links'\n", | ||
" driver.get(url_download)\n", | ||
" \n", | ||
" # Wait for the overlay element to be present\n", | ||
" overlay_element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'overlay--active')))\n", | ||
"\n", | ||
" # Remove the automaic pop-up overlay \n", | ||
" body_element = driver.find_element_by_tag_name('body')\n", | ||
" body_element.click()\n", | ||
" time.sleep(1)\n", | ||
" \n", | ||
" # Switch between the possible download files\n", | ||
" search_text = \"\"\n", | ||
" match label:\n", | ||
" case \"TI.zip\":\n", | ||
" search_text = '//a[contains(text(), \"Download Training Images (zip)\")]'\n", | ||
" case \"TL.zip\":\n", | ||
" search_text = '//a[contains(text(), \"Download Training Labels (zip)\")]'\n", | ||
" case \"VI.zip\":\n", | ||
" search_text = '//a[contains(text(), \"Download Validation Images (zip)\")]'\n", | ||
" case \"TI.tgz\":\n", | ||
" search_text = '//a[contains(text(), \"Download Training Images (tgz)\")]'\n", | ||
" case \"TL.tgz\":\n", | ||
" search_text = '//a[contains(text(), \"Download Training Labels (tgz)\")]'\n", | ||
" case \"VI.tgz\":\n", | ||
" search_text = '//a[contains(text(), \"Download Validation Images (tgz)\")]'\n", | ||
" case _:\n", | ||
" raise ValueError(\"Error: This is an invalid download option\") \n", | ||
" \n", | ||
" # Wait for the download link to be present\n", | ||
" download_link_element = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.XPATH, search_text)))\n", | ||
" \n", | ||
" # Get the dynamic download link from the href attribute\n", | ||
" download_link = download_link_element.get_attribute('href')\n", | ||
" \n", | ||
" # Download the dataset using the obtained link\n", | ||
" if download_link:\n", | ||
" driver.get(download_link)\n", | ||
" print(\"Dataset download started successfully.\")\n", | ||
" \n", | ||
" # Extract the filename from the download link URL\n", | ||
" parsed_url = urlparse(download_link)\n", | ||
" filename = parsed_url.path.split('/')[-1]\n", | ||
" downloaded_file = os.path.join(move_to_dir, filename)\n", | ||
" print(downloaded_file)\n", | ||
" \n", | ||
" # Check if the download directory exists\n", | ||
" if not os.path.exists(move_to_dir):\n", | ||
" os.makedirs(move_to_dir)\n", | ||
" \n", | ||
" # Wait for the file to be completely downloaded\n", | ||
" start_time = time.time()\n", | ||
" \n", | ||
" while True:\n", | ||
" if os.path.exists(downloaded_file) and os.path.getsize(downloaded_file) > 0:\n", | ||
" print(\"File downloaded successfully.\")\n", | ||
" break\n", | ||
" elif time.time() - start_time > max_download_time:\n", | ||
" print(\"Error: Maximum wait time exceeded.\")\n", | ||
" break\n", | ||
" else:\n", | ||
" time.sleep(5)\n", | ||
" \n", | ||
" else:\n", | ||
" print(\"Failed to get the download link.\")\n", | ||
"\n", | ||
" # Close the browser\n", | ||
" driver.quit()\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "e7b2f96d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"login_and_download(username, password, move_to_dir, chromedriver_path, max_download_time, label)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
apiVersion: batch/v1 | ||
kind: Job | ||
metadata: | ||
name: input-xview-download | ||
spec: | ||
template: | ||
spec: | ||
containers: | ||
- name: input-xview-download | ||
image: docker.io/mdorzweiler/claimed-input-xview-download:0.1 | ||
workingDir: /opt/app-root/src/ | ||
command: ["/opt/app-root/bin/ipython","claimed_input-Xview-download.ipynb"] | ||
env: | ||
- name: log_level | ||
value: value_of_log_level | ||
- name: username | ||
value: value_of_username | ||
- name: password | ||
value: value_of_password | ||
- name: move_to_dir | ||
value: value_of_move_to_dir | ||
- name: chromedriver_path | ||
value: value_of_chromedriver_path | ||
- name: max_download_time | ||
value: value_of_max_download_time | ||
- name: label | ||
value: value_of_label | ||
restartPolicy: OnFailure | ||
imagePullSecrets: | ||
- name: image_pull_secret |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
name: input-xview-download | ||
description: "## Xview Dataset Download – CLAIMED V0.1" | ||
|
||
inputs: | ||
- {name: log_level, type: String, description: "update log level", default: "INFO"} | ||
- {name: username, type: String, description: "username for the Xview webpage to authorize login"} | ||
- {name: password, type: String, description: "password for the Xview webpage to authorize login"} | ||
- {name: move_to_dir, type: String, description: "move_to_dir the directory where the dataset should be saved"} | ||
- {name: chromedriver_path, type: String, description: "chromedriver_path the directory where the local copy of chromedriver is saved"} | ||
- {name: max_download_time, type: String, description: "max_download_time before timeout, must be adjusted according to the file size and internet speed"} | ||
- {name: label, type: String, description: "standing for TI=Training Images, TL=Training Labels, VI=Validation Images"} | ||
|
||
|
||
outputs: | ||
|
||
|
||
implementation: | ||
container: | ||
image: docker.io/mdorzweiler/claimed-input-xview-download:0.1 | ||
command: | ||
- sh | ||
- -ec | ||
- | | ||
ipython ./claimed_input-Xview-download.ipynb log_level="${0}" username="${1}" password="${2}" move_to_dir="${3}" chromedriver_path="${4}" max_download_time="${5}" label="${6}" | ||
- {inputValue: log_level} | ||
- {inputValue: username} | ||
- {inputValue: password} | ||
- {inputValue: move_to_dir} | ||
- {inputValue: chromedriver_path} | ||
- {inputValue: max_download_time} | ||
- {inputValue: label} |