Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: update html parser to work on very high values like 5k+ #206

Merged
merged 2 commits into from
May 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 11 additions & 13 deletions tap_github/scraping.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,12 @@ def _scrape_dependents(url: str, logger: logging.Logger) -> Iterable[Dict[str, A
url = ""


def parse_counter(
tag: Union[Tag, NavigableString, None], logger: logging.Logger
) -> int:
def parse_counter(tag: Union[Tag, NavigableString, None]) -> int:
"""
Extract a count of [issues|PR|contributors...] from an HTML tag.
For very high numbers, we only get an approximate value as github
does not provide the actual number.
"""
if not tag:
return 0
try:
Expand All @@ -106,8 +109,8 @@ def parse_counter(
title_string = cast(str, title)
else:
title_string = cast(str, title[0])
return int(title_string.strip().replace(",", ""))
except KeyError:
return int(title_string.strip().replace(",", "").replace("+", ""))
except (KeyError, ValueError):
raise IndexError(
f"Could not parse counter {tag}. Maybe the GitHub page format has changed?"
)
Expand All @@ -123,10 +126,8 @@ def scrape_metrics(
soup = BeautifulSoup(response.content, "html.parser")

try:
issues = parse_counter(soup.find("span", id="issues-repo-tab-count"), logger)
prs = parse_counter(
soup.find("span", id="pull-requests-repo-tab-count"), logger
)
issues = parse_counter(soup.find("span", id="issues-repo-tab-count"))
prs = parse_counter(soup.find("span", id="pull-requests-repo-tab-count"))
except IndexError:
# These two items should exist. We raise an error if we could not find them.
raise IndexError(
Expand All @@ -140,9 +141,7 @@ def scrape_metrics(
dependents: int = 0
if dependents_node_parent is not None:
if dependents_node_parent["href"].endswith("/network/dependents"):
dependents = parse_counter(
getattr(dependents_node, "next_element", None), logger
)
dependents = parse_counter(getattr(dependents_node, "next_element", None))

# likewise, handle edge cases with contributors
contributors_node = soup.find(text=contributors_regex)
Expand All @@ -152,7 +151,6 @@ def scrape_metrics(
if contributors_node_parent["href"].endswith("/graphs/contributors"):
contributors = parse_counter(
getattr(contributors_node, "next_element", None),
logger,
)

fetched_at = datetime.now(tz=timezone.utc)
Expand Down
29 changes: 29 additions & 0 deletions tap_github/tests/test_tap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
from unittest.mock import patch

import pytest
from bs4 import BeautifulSoup
from dateutil.parser import isoparse
from singer_sdk._singerlib import Catalog
from singer_sdk.helpers import _catalog as cat_helpers

from tap_github.scraping import parse_counter
from tap_github.tap import TapGitHub

from .fixtures import alternative_sync_chidren, repo_list_config, username_list_config
Expand Down Expand Up @@ -165,3 +167,30 @@ def test_get_a_user_in_user_usernames_mode(
assert '{"username": "aaronsteers"' in captured_out
assert '{"username": "aaRONsTeeRS"' not in captured_out
assert '{"username": "EricBoucher"' not in captured_out


def test_web_tag_parse_counter():
    """
    Verify parse_counter on several counter renderings.

    The parser is used by the extra_metrics stream. Each case pairs an HTML
    snippet containing a single counter <span> with the integer that
    parse_counter is expected to return for it.
    """
    cases = (
        # Regular int: title and visible text are the same number.
        (
            '<span id="issues-repo-tab-count" title="57" class="Counter">57</span>',
            57,
        ),
        # 2k: the visible text is abbreviated; the expected value is the
        # number carried by the title attribute.
        (
            '<span id="issues-repo-tab-count" title="2028" class="Counter">2k</span>',
            2028,
        ),
        # 5k+: the real number is not available in the page, so the
        # approximate value is the expected result.
        (
            '<span id="issues-repo-tab-count" title="5,000+" class="Counter">5k+</span>',
            5_000,
        ),
    )

    for markup, expected in cases:
        tag = BeautifulSoup(markup, "html.parser").span
        assert parse_counter(tag) == expected