diff --git a/.config/coveragerc b/.config/coveragerc new file mode 100644 index 0000000..3a3a813 --- /dev/null +++ b/.config/coveragerc @@ -0,0 +1,12 @@ +[report] +omit = + */run.py + */python?.?/* + */venv/* + */site-packages/* + */tests/* + *__init__* + */_version.py + +exclude_lines = + if __name__ == '__main__': diff --git a/.config/pre-commit-config.yaml b/.config/pre-commit-config.yaml new file mode 100644 index 0000000..ab31cf6 --- /dev/null +++ b/.config/pre-commit-config.yaml @@ -0,0 +1,33 @@ +default_language_version: + python: python3.12 +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + exclude: test_scraper_.*\.json + - id: check-ast + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.1 + hooks: + # Run the linter. + - id: ruff + args: [--config, .config/ruff.toml, --fix] + # Run the formatter. + - id: ruff-format + args: [--config, .config/ruff.toml] + - repo: https://github.com/astral-sh/uv-pre-commit + rev: 0.2.37 + hooks: + # Run the pip compile + - id: pip-compile + name: pip-compile requirements.txt + files: pyproject.toml + args: [ pyproject.toml, --resolver=backtracking, --upgrade, -q, + -o, requirements.txt ] + - id: pip-compile + name: pip-compile requirements-test.txt + files: pyproject.toml + args: [ pyproject.toml, --resolver=backtracking, --upgrade, -q, + --extra, test, -c, requirements.txt, -o, requirements-test.txt ] diff --git a/.config/pytest.ini b/.config/pytest.ini new file mode 100644 index 0000000..bb28fd3 --- /dev/null +++ b/.config/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +pythonpath = ../src +addopts = "--color=yes" +log_cli = 1 diff --git a/.config/ruff.toml b/.config/ruff.toml new file mode 100644 index 0000000..922bb80 --- /dev/null +++ b/.config/ruff.toml @@ -0,0 +1,14 @@ +line-length = 79 +exclude = ["_version.py"] + +[lint] +# List of rules: https://docs.astral.sh/ruff/rules/ +select = [ + "E", # pycodestyle - default + 
"F", # pyflakes - default + "I" # isort +] + +[lint.isort] +known-local-folder = ["hdx.scraper.dcc"] +known-third-party = ["hdx.api", "hdx.location", "hdx.data", "hdx.database", "hdx.facades", "hdx.scraper", "hdx.utilities"] diff --git a/.github/workflows/run-python-script.yaml b/.github/workflows/run-python-script.yaml new file mode 100644 index 0000000..52b3df4 --- /dev/null +++ b/.github/workflows/run-python-script.yaml @@ -0,0 +1,45 @@ +# This workflow will install Python dependencies and run the script + +name: Run script + +on: + workflow_dispatch: # add run button in github + schedule: + - cron: "32 10 * * *" + +jobs: + run: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.x + uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install . + - name: Run script + env: + HDX_SITE: ${{ vars.HDX_SITE }} + HDX_KEY: ${{ secrets.HDX_BOT_SCRAPERS_API_TOKEN }} + PREPREFIX: ${{ secrets.HDX_PIPELINE_PREPREFIX }} + USER_AGENT: ${{ vars.USER_AGENT }} + EXTRA_PARAMS: ${{ vars.EXTRA_PARAMS }} + run: | + python -m hdx.scraper.{{cookiecutter.scraper_name}} + - name: Send mail + if: failure() + uses: dawidd6/action-send-mail@v3 + with: + server_address: ${{secrets.HDX_PIPELINE_EMAIL_SERVER}} + server_port: ${{secrets.HDX_PIPELINE_EMAIL_PORT}} + username: ${{secrets.HDX_PIPELINE_EMAIL_USERNAME}} + password: ${{secrets.HDX_PIPELINE_EMAIL_PASSWORD}} + subject: "FAILED: ${{github.repository}} run job" + body: GitHub Actions run job for ${{github.repository}} failed! 
+ to: ${{secrets.HDX_PIPELINE_EMAIL_LIST}} + from: ${{secrets.HDX_PIPELINE_EMAIL_FROM}} diff --git a/.github/workflows/run-python-tests.yaml b/.github/workflows/run-python-tests.yaml new file mode 100644 index 0000000..53f60ac --- /dev/null +++ b/.github/workflows/run-python-tests.yaml @@ -0,0 +1,49 @@ +# This workflow will install Python dependencies, lint and run tests +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Run tests + +on: + workflow_dispatch: # add run button in github + push: + branches-ignore: + - gh-pages + - 'dependabot/**' + pull_request: + branches-ignore: + - gh-pages + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + - name: Install Hatch + uses: pypa/hatch@install + - name: Test with hatch/pytest + env: + HDX_KEY_TEST: ${{ secrets.HDX_BOT_SCRAPERS_API_TOKEN }} + GSHEET_AUTH: ${{ secrets.GSHEET_AUTH }} + run: | + hatch test + - name: Check styling + if: always() + run: | + hatch fmt --check + - name: Publish Unit Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + junit_files: test-results.xml + - name: Publish in Coveralls + uses: coverallsapp/github-action@v2 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + flag-name: tests + format: lcov diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b94498d --- /dev/null +++ b/.gitignore @@ -0,0 +1,173 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a 
template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +## Project + +# Directory where the scraper caches data +saved_data/ + +# Version file +**/_version.py + +# Mac files +.DS_Store diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3c8d5c6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 UN-OCHA Humanitarian Data Exchange Project + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..44462c3 --- /dev/null +++ b/README.md @@ -0,0 +1,102 @@ +# Collector for dcc Datasets +[![Build Status](https://github.com/OCHA-DAP/hdx-scraper-dcc/actions/workflows/run-python-tests.yaml/badge.svg)](https://github.com/OCHA-DAP/hdx-scraper-dcc/actions/workflows/run-python-tests.yaml) +[![Coverage Status](https://coveralls.io/repos/github/OCHA-DAP/hdx-scraper-dcc/badge.svg?branch=main&ts=1)](https://coveralls.io/github/OCHA-DAP/hdx-scraper-dcc?branch=main) + +This script ... + +## Development + +### Environment + +Development is currently done using Python 3.12. We recommend using a virtual +environment such as ``venv``: + + python3.12 -m venv venv + source venv/bin/activate + +In your virtual environment, please install all packages for +development by running: + + pip install -r requirements.txt + +### Installing and running + + +For the script to run, you will need to have a file called +.hdx_configuration.yaml in your home directory containing your HDX key, e.g.: + + hdx_key: "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX" + hdx_read_only: false + hdx_site: prod + + You will also need to supply the universal .useragents.yaml file in your home + directory as specified in the parameter *user_agent_config_yaml* passed to + facade in run.py. The collector reads the key **hdx-scraper-dcc** as specified + in the parameter *user_agent_lookup*. 
+ + Alternatively, you can set up environment variables: `USER_AGENT`, `HDX_KEY`, +`HDX_SITE`, `EXTRA_PARAMS`, `TEMP_DIR`, and `LOG_FILE_ONLY`. + +To install and run, execute: + + pip install . + python -m hdx.scraper.dcc + +## Environment + +Development is currently done using Python 3.12. We recommend using a virtual +environment such as ``venv``: + + python3.12 -m venv venv + source venv/bin/activate + +### Pre-commit + +Be sure to install `pre-commit`, which is run every time +you make a git commit: + +```shell +pip install pre-commit +pre-commit install +``` + +The configuration file for this project is in a +non-standard location. Thus, you will need to edit your +`.git/hooks/pre-commit` file to reflect this. Change +the first line that begins with `ARGS` to: + + ARGS=(hook-impl --config=.config/pre-commit-config.yaml --hook-type=pre-commit) + +With pre-commit, all code is formatted according to +[black]("https://github.com/psf/black") and +[ruff]("https://github.com/charliermarsh/ruff") guidelines. + +To check if your changes pass pre-commit without committing, run: + + pre-commit run --all-files --config=.config/pre-commit-config.yaml + +### Testing + +Ensure you have the required packages to run the tests: + + pip install -r requirements-test.txt + +To run the tests and view coverage, execute: + +` pytest -c .config/pytest.ini --cov hdx --cov-config .config/coveragerc +` +### Packages + +[pip-tools](https://github.com/jazzband/pip-tools) is used for +package management. If you’ve introduced a new package to the +source code please add it to the `dependencies` section of +`pyproject.toml` with any known version constraints. + +For adding packages for testing, add them to +the `test` sections under `[project.optional-dependencies]`. 
+ +Any changes to the dependencies will be automatically reflected in +`requirements.txt` and `requirements-test.txt` with `pre-commit`, +but you can re-generate the file without committing by executing: + + pre-commit run pip-compile --all-files --config=.config/pre-commit-config.yaml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..31de1c6 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,71 @@ +######################### +# Project Configuration # +######################### + +# Project name and version needed to run tests + +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "hdx-scraper-dcc" +requires-python = ">=3.12" +dependencies = [ + "hdx-python-api", + "hdx-python-utilities", +] + +dynamic = ["version"] + +[project.optional-dependencies] +test = [ + "pytest", + "pytest-cov" +] +dev = ["pre-commit"] + +######### +# Hatch # +######### + +# Build + +[tool.hatch.build.targets.wheel] +packages = ["src/hdx"] + +[tool.hatch.build.hooks.vcs] +version-file = "src/hdx/scraper/dcc/_version.py" + +[tool.hatch.metadata] +allow-direct-references = true + +# Versioning + +[tool.hatch.version] +source = "vcs" + +[tool.hatch.version.raw-options] +local_scheme = "no-local-version" +version_scheme = "python-simplified-semver" + +# Tests + +[tool.hatch.envs.hatch-test] +features = ["test"] + +[[tool.hatch.envs.hatch-test.matrix]] +python = ["3.12"] + +[tool.hatch.envs.hatch-test.scripts] +run = """ + pytest -c .config/pytest.ini --rootdir=. 
--junitxml=test-results.xml \ + --cov --cov-config=.config/coveragerc --no-cov-on-fail \ + --cov-report=lcov --cov-report=term-missing + """ + +[tool.hatch.envs.hatch-static-analysis.scripts] +format-check = ["ruff format --config .config/ruff.toml --check --diff {args:.}",] +format-fix = ["ruff format --config .config/ruff.toml {args:.}",] +lint-check = ["ruff check --config .config/ruff.toml {args:.}",] +lint-fix = ["ruff check --config .config/ruff.toml --fix {args:.}",] diff --git a/src/hdx/scraper/dcc/__init__.py b/src/hdx/scraper/dcc/__init__.py new file mode 100644 index 0000000..d986e5b --- /dev/null +++ b/src/hdx/scraper/dcc/__init__.py @@ -0,0 +1 @@ +from ._version import version as __version__ # noqa: F401 diff --git a/src/hdx/scraper/dcc/__main__.py b/src/hdx/scraper/dcc/__main__.py new file mode 100755 index 0000000..7ff18e0 --- /dev/null +++ b/src/hdx/scraper/dcc/__main__.py @@ -0,0 +1,76 @@ +#!/usr/bin/python +""" +Top level script. Calls other functions that generate datasets that this +script then creates in HDX. + +""" + +import logging +from os.path import dirname, expanduser, join + +from hdx.api.configuration import Configuration +from hdx.facades.infer_arguments import facade +from hdx.utilities.downloader import Download +from hdx.utilities.path import ( + wheretostart_tempdir_batch, +) +from hdx.utilities.retriever import Retrieve + +logger = logging.getLogger(__name__) + +_USER_AGENT_LOOKUP = "hdx-scraper-dcc" +_SAVED_DATA_DIR = "saved_data" # Keep in repo to avoid deletion in /tmp +_UPDATED_BY_SCRIPT = "HDX Scraper: dcc" + + +def main( + save: bool = True, + use_saved: bool = False, +) -> None: + """Generate datasets and create them in HDX + + Args: + save (bool): Save downloaded data. Defaults to True. + use_saved (bool): Use saved data. Defaults to False. 
+ + Returns: + None + """ + with wheretostart_tempdir_batch(folder=_USER_AGENT_LOOKUP) as info: + temp_dir = info["folder"] + with Download() as downloader: + retriever = Retrieve( + downloader=downloader, + fallback_dir=temp_dir, + saved_dir=_SAVED_DATA_DIR, + temp_dir=temp_dir, + save=save, + use_saved=use_saved, + ) + configuration = Configuration.read() + # + # Steps to generate dataset + # + dataset.update_from_yaml( + path=join( + dirname(__file__), "config", "hdx_dataset_static.yaml" + ) + ) + dataset.create_in_hdx( + remove_additional_resources=True, + match_resource_order=False, + hxl_update=False, + updated_by_script=_UPDATED_BY_SCRIPT, + batch=info["batch"], + ) + + +if __name__ == "__main__": + facade( + main, + hdx_site="dev", + user_agent_config_yaml=join(expanduser("~"), ".useragents.yaml"), + user_agent_lookup=_USER_AGENT_LOOKUP, + project_config_yaml=join( + dirname(__file__), "config", "project_configuration.yaml"), + ) diff --git a/src/hdx/scraper/dcc/config/hdx_dataset_static.yaml b/src/hdx/scraper/dcc/config/hdx_dataset_static.yaml new file mode 100755 index 0000000..9c2fc18 --- /dev/null +++ b/src/hdx/scraper/dcc/config/hdx_dataset_static.yaml @@ -0,0 +1,10 @@ +license_id: cc-by +methodology: Other +caveats: None +dataset_source: Data for Children Collaborative +package_creator: HDX Data Systems Team +private: False +maintainer: 71421920-fdc8-40fb-ac97-99b85e90b8a7 +owner_org: dataset organization HDX ID +data_update_frequency: -2 +notes: dataset notes diff --git a/src/hdx/scraper/dcc/config/project_configuration.yaml b/src/hdx/scraper/dcc/config/project_configuration.yaml new file mode 100755 index 0000000..8e3c36e --- /dev/null +++ b/src/hdx/scraper/dcc/config/project_configuration.yaml @@ -0,0 +1 @@ +# Collector specific configuration diff --git a/src/hdx/scraper/dcc/dcc.py b/src/hdx/scraper/dcc/dcc.py new file mode 100755 index 0000000..7685237 --- /dev/null +++ b/src/hdx/scraper/dcc/dcc.py @@ -0,0 +1,55 @@ +#!/usr/bin/python +"""dcc 
scraper""" + +import logging +from typing import List, Optional + + +from hdx.api.configuration import Configuration +from hdx.data.dataset import Dataset +from hdx.data.hdxobject import HDXError +from hdx.utilities.retriever import Retrieve + +logger = logging.getLogger(__name__) + + +class dcc: + + def __init__( + self, configuration: Configuration, retriever: Retrieve, temp_dir: str + ): + self._configuration = configuration + self._retriever = retriever + self._temp_dir = temp_dir + + + def generate_dataset(self) -> Optional[Dataset]: + + # To be generated + dataset_name = None + dataset_title = None + dataset_time_period = None + dataset_tags = None + dataset_country_iso3 = None + + # Dataset info + dataset = Dataset( + { + "name": dataset_name, + "title": dataset_title, + } + ) + + dataset.set_time_period(dataset_time_period) + dataset.add_tagsa(dataset_tags) + # Only if needed + dataset.set_subnational(True) + try: + dataset.add_country_location(dataset_country_iso3) + except HDXError: + logger.error(f"Couldn't find country {dataset_country_iso3}, skipping") + return + + # Add resources here + + return dataset diff --git a/tests/fixtures/.deleteme b/tests/fixtures/.deleteme new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/input/.deleteme b/tests/fixtures/input/.deleteme new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_dcc.py b/tests/test_dcc.py new file mode 100644 index 0000000..d4e2198 --- /dev/null +++ b/tests/test_dcc.py @@ -0,0 +1,58 @@ +from os.path import join + +import pytest +from hdx.api.configuration import Configuration +from hdx.utilities.downloader import Download +from hdx.utilities.path import temp_dir +from hdx.utilities.retriever import Retrieve +from hdx.utilities.useragent import UserAgent + + +class Testdcc: + @pytest.fixture(scope="function") + def configuration(self, config_dir): + UserAgent.set_global("test") + Configuration._create( + hdx_read_only=True, + hdx_site="prod", + 
project_config_yaml=join(config_dir, "project_configuration.yaml"), + ) + return Configuration.read() + + @pytest.fixture(scope="class") + def fixtures_dir(self): + return join("tests", "fixtures") + + @pytest.fixture(scope="class") + def input_dir(self, fixtures_dir): + return join(fixtures_dir, "input") + + @pytest.fixture(scope="class") + def config_dir(self, fixtures_dir): + return join("src", "hdx", "scraper", "dcc", "config") + + def test_dcc( + self, + configuration, + fixtures_dir, + input_dir, + config_dir + ): + with temp_dir( + "Testdcc", + delete_on_success=True, + delete_on_failure=False, + ) as tempdir: + with Download(user_agent="test") as downloader: + retriever = Retrieve( + downloader=downloader, + fallback_dir=tempdir, + saved_dir=input_dir, + temp_dir=tempdir, + save=False, + use_saved=True, + ) + + dataset.update_from_yaml( + path=join(config_dir, "hdx_dataset_static.yaml") + )