Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run scrapers against live data sources #1059

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions .github/workflows/scheduled.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
name: Scheduled scraper tests

on:
pull_request: {}
workflow_dispatch: {}
schedule:
- cron: "0 0 * * 2" # every Tuesday at 00:00 UTC

jobs:
build:
runs-on: ubuntu-latest

defaults:
run:
working-directory: ./backend

services:
meilisearch:
image: "getmeili/meilisearch:v1.3.1"
ports: ["7700:7700"]
env:
MEILI_MASTER_KEY: "1234567890"

steps:
- name: Checkout repo
uses: actions/checkout@v4

- name: Install poetry
run: pipx install poetry

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
cache: "poetry"
cache-dependency-path: "./backend/poetry.lock"

- name: Install dependencies
run: poetry install

- name: Run tests against live data sources
run: make test
env:
HTV_TEST_MOCK_REQUESTS: "false"
HTV_BACKEND_DATABASE_URI: "sqlite:///${{ github.workspace }}/storage/database/database.sqlite3"
HTV_BACKEND_USERS_DATABASE_URI: "sqlite:///${{ github.workspace }}/storage/database/users.sqlite3"
MEILI_MASTER_KEY: "1234567890"
MEILI_URL: "http://localhost:7700"
21 changes: 9 additions & 12 deletions backend/howtheyvote/scrapers/votes.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,10 +380,7 @@ def _procedure_reference(self, doc: BeautifulSoup) -> str | None:

class ProcedureScraper(BeautifulSoupScraper):
BS_PARSER = "lxml"
BASE_URL = (
"https://oeil.secure.europarl.europa.eu/"
"oeil/popups/ficheprocedure.do?lang=en&reference="
)
BASE_URL = "https://oeil.secure.europarl.europa.eu/oeil/en/procedure-file?reference="

TITLE_PREFIXES = ["Resolution on", "Motion"]

Expand Down Expand Up @@ -437,23 +434,23 @@ def _title(self, doc: BeautifulSoup) -> str | None:
return normalized_title[:1].upper() + normalized_title[1:]

def _geo_areas(self, doc: BeautifulSoup) -> list[str]:
start = doc.select_one(
'#basic-information-data strong:-soup-contains("Geographical area")'
# The website unfortunately doesn't use semantic markup, so we have
# to rely on visual properties
wrapper = doc.select_one(
'#section1 p.font-weight-bold:-soup-contains("Geographical area") + p'
)

if not start:
if not wrapper:
return []

geo_areas = []

for sibling in start.next_siblings:
if isinstance(sibling, Tag) and sibling.name == "strong":
break
for node in wrapper.children:
country_name = node.get_text(strip=True)

if not sibling.get_text(strip=True):
if not country_name:
continue

country_name = sibling.get_text(strip=True)
country = Country.from_label(country_name, fuzzy=True)

if not country:
Expand Down
3 changes: 3 additions & 0 deletions backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ env = [
"HTV_BACKEND_PUBLIC_URL=https://example.org/api",
"HTV_SEARCH_INDEX_PREFIX=test",
]
markers = [
"always_mock_requests: Always mock HTTP requests, even when request mocks are disabled globally"
]
addopts = [
"--import-mode=importlib",
]
39 changes: 35 additions & 4 deletions backend/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
executed and rollback any changes after execution. In order to test API routes, we make use
of Flask’s built-in test client."""

import os

import pytest
import responses as responses_lib
from responses import FirstMatchRegistry, RequestsMock

from howtheyvote.db import Session, engine, migrate, session_factory
from howtheyvote.meili import configure_indexes, delete_indexes
Expand Down Expand Up @@ -55,8 +57,37 @@ def api(app):
yield app.test_client()


class DummyRegistry(FirstMatchRegistry):
    """A registry that discards every mock response that is added to it.

    `add` returns the response unchanged without storing it, so mocks
    registered through this registry never take effect.
    """

    def add(self, response):
        # Hand the response back to the caller without registering it.
        return response


@pytest.fixture
def responses():
"""Allows mocking HTTP requests made with requests."""
with responses_lib.RequestsMock() as r:
def responses(request):
"""Allows mocking HTTP requests made with `requests`. Request mocking can be
disabled globally using the `HTV_TEST_MOCK_REQUESTS=false` env variable to
run tests against the live sources rather than test fixtures. Individual tests
can be marked using `always_mock_requests` to mock them even if request mocks
are disabled globally. However, it’s preferred to write tests that can be run
against the live data sources."""

mock_requests = os.environ.get("HTV_TEST_MOCK_REQUESTS", "true").lower() in ["true", "1"]
marks = [m.name for m in request.node.iter_markers()]
always_mock_requests = "always_mock_requests" in marks

if always_mock_requests or mock_requests:
with RequestsMock() as r:
# Yield a "normal" requests mock that fails any request that isn’t explicitly mocked.
yield r
return

# When calling `responses.get("http://...", body="Lorem ipsum")` in a test to register
# a mock response, the mock is stored in a registry. When the tested code then tries to send
# a matching request, `responses` tries to find a matching mock in the registry. To
# disable all mocks, we simply pass a dummy registry that never actually registers any
# mocks and allow all unmatched requests to pass to the original source.
with RequestsMock(registry=DummyRegistry) as r:
r.add_passthru("http")
yield r
1 change: 1 addition & 0 deletions backend/tests/pipelines/test_rcv_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from howtheyvote.pipelines import DataUnavailableError, RCVListPipeline


@pytest.mark.always_mock_requests
def test_run_source_not_available(responses, db_session):
with pytest.raises(DataUnavailableError):
pipe = RCVListPipeline(term=9, date=datetime.date(2024, 4, 10))
Expand Down
Loading