diff --git a/tap_github/scraping.py b/tap_github/scraping.py index f3f043b8..35be8b61 100644 --- a/tap_github/scraping.py +++ b/tap_github/scraping.py @@ -93,9 +93,12 @@ def _scrape_dependents(url: str, logger: logging.Logger) -> Iterable[Dict[str, A url = "" -def parse_counter( - tag: Union[Tag, NavigableString, None], logger: logging.Logger -) -> int: +def parse_counter(tag: Union[Tag, NavigableString, None]) -> int: + """ + Extract a count of [issues|PR|contributors...] from an HTML tag. + For very high numbers, we only get an approximate value as github + does not provide the actual number. + """ if not tag: return 0 try: @@ -106,8 +109,8 @@ def parse_counter( title_string = cast(str, title) else: title_string = cast(str, title[0]) - return int(title_string.strip().replace(",", "")) - except KeyError: + return int(title_string.strip().replace(",", "").replace("+", "")) + except (KeyError, ValueError): raise IndexError( f"Could not parse counter {tag}. Maybe the GitHub page format has changed?" ) @@ -123,10 +126,8 @@ def scrape_metrics( soup = BeautifulSoup(response.content, "html.parser") try: - issues = parse_counter(soup.find("span", id="issues-repo-tab-count"), logger) - prs = parse_counter( - soup.find("span", id="pull-requests-repo-tab-count"), logger - ) + issues = parse_counter(soup.find("span", id="issues-repo-tab-count")) + prs = parse_counter(soup.find("span", id="pull-requests-repo-tab-count")) except IndexError: # These two items should exist. We raise an error if we could not find them. 
raise IndexError( @@ -140,9 +141,7 @@ def scrape_metrics( dependents: int = 0 if dependents_node_parent is not None: if dependents_node_parent["href"].endswith("/network/dependents"): - dependents = parse_counter( - getattr(dependents_node, "next_element", None), logger - ) + dependents = parse_counter(getattr(dependents_node, "next_element", None)) # likewise, handle edge cases with contributors contributors_node = soup.find(text=contributors_regex) @@ -152,7 +151,6 @@ def scrape_metrics( if contributors_node_parent["href"].endswith("/graphs/contributors"): contributors = parse_counter( getattr(contributors_node, "next_element", None), - logger, ) fetched_at = datetime.now(tz=timezone.utc) diff --git a/tap_github/tests/test_tap.py b/tap_github/tests/test_tap.py index 4004c904..1ffb0faf 100644 --- a/tap_github/tests/test_tap.py +++ b/tap_github/tests/test_tap.py @@ -5,10 +5,12 @@ from unittest.mock import patch import pytest +from bs4 import BeautifulSoup from dateutil.parser import isoparse from singer_sdk._singerlib import Catalog from singer_sdk.helpers import _catalog as cat_helpers +from tap_github.scraping import parse_counter from tap_github.tap import TapGitHub from .fixtures import alternative_sync_chidren, repo_list_config, username_list_config @@ -165,3 +167,30 @@ def test_get_a_user_in_user_usernames_mode( assert '{"username": "aaronsteers"' in captured_out assert '{"username": "aaRONsTeeRS"' not in captured_out assert '{"username": "EricBoucher"' not in captured_out + + +def test_web_tag_parse_counter(): + """ + Check that the parser runs ok on various forms of counters. + Used in extra_metrics stream. + """ + # regular int + tag = BeautifulSoup( + '57', + "html.parser", + ).span + assert parse_counter(tag) == 57 + + # 2k + tag = BeautifulSoup( + '2k', + "html.parser", + ).span + assert parse_counter(tag) == 2028 + + # 5k+. 
The real number is not available on the page, so we use this approximate value + tag = BeautifulSoup( + '<span title="5,000+">5k+</span>', + "html.parser", + ).span + assert parse_counter(tag) == 5_000