diff --git a/tap_github/scraping.py b/tap_github/scraping.py
index f3f043b8..35be8b61 100644
--- a/tap_github/scraping.py
+++ b/tap_github/scraping.py
@@ -93,9 +93,12 @@ def _scrape_dependents(url: str, logger: logging.Logger) -> Iterable[Dict[str, A
url = ""
-def parse_counter(
- tag: Union[Tag, NavigableString, None], logger: logging.Logger
-) -> int:
+def parse_counter(tag: Union[Tag, NavigableString, None]) -> int:
+ """
+ Extract a count of [issues|PR|contributors...] from an HTML tag.
+ For very high numbers, we only get an approximate value as github
+ does not provide the actual number.
+ """
if not tag:
return 0
try:
@@ -106,8 +109,8 @@ def parse_counter(
title_string = cast(str, title)
else:
title_string = cast(str, title[0])
- return int(title_string.strip().replace(",", ""))
- except KeyError:
+ return int(title_string.strip().replace(",", "").replace("+", ""))
+ except (KeyError, ValueError):
raise IndexError(
f"Could not parse counter {tag}. Maybe the GitHub page format has changed?"
)
@@ -123,10 +126,8 @@ def scrape_metrics(
soup = BeautifulSoup(response.content, "html.parser")
try:
- issues = parse_counter(soup.find("span", id="issues-repo-tab-count"), logger)
- prs = parse_counter(
- soup.find("span", id="pull-requests-repo-tab-count"), logger
- )
+ issues = parse_counter(soup.find("span", id="issues-repo-tab-count"))
+ prs = parse_counter(soup.find("span", id="pull-requests-repo-tab-count"))
except IndexError:
# These two items should exist. We raise an error if we could not find them.
raise IndexError(
@@ -140,9 +141,7 @@ def scrape_metrics(
dependents: int = 0
if dependents_node_parent is not None:
if dependents_node_parent["href"].endswith("/network/dependents"):
- dependents = parse_counter(
- getattr(dependents_node, "next_element", None), logger
- )
+ dependents = parse_counter(getattr(dependents_node, "next_element", None))
# likewise, handle edge cases with contributors
contributors_node = soup.find(text=contributors_regex)
@@ -152,7 +151,6 @@ def scrape_metrics(
if contributors_node_parent["href"].endswith("/graphs/contributors"):
contributors = parse_counter(
getattr(contributors_node, "next_element", None),
- logger,
)
fetched_at = datetime.now(tz=timezone.utc)
diff --git a/tap_github/tests/test_tap.py b/tap_github/tests/test_tap.py
index 4004c904..1ffb0faf 100644
--- a/tap_github/tests/test_tap.py
+++ b/tap_github/tests/test_tap.py
@@ -5,10 +5,12 @@
from unittest.mock import patch
import pytest
+from bs4 import BeautifulSoup
from dateutil.parser import isoparse
from singer_sdk._singerlib import Catalog
from singer_sdk.helpers import _catalog as cat_helpers
+from tap_github.scraping import parse_counter
from tap_github.tap import TapGitHub
from .fixtures import alternative_sync_chidren, repo_list_config, username_list_config
@@ -165,3 +167,30 @@ def test_get_a_user_in_user_usernames_mode(
assert '{"username": "aaronsteers"' in captured_out
assert '{"username": "aaRONsTeeRS"' not in captured_out
assert '{"username": "EricBoucher"' not in captured_out
+
+
+def test_web_tag_parse_counter():
+ """
+ Check that the parser runs ok on various forms of counters.
+ Used in extra_metrics stream.
+ """
+ # regular int
+ tag = BeautifulSoup(
+ '57',
+ "html.parser",
+ ).span
+ assert parse_counter(tag) == 57
+
+ # 2k
+ tag = BeautifulSoup(
+        '<span id="issues-repo-tab-count" title="2,028" class="Counter">2k</span>',
+ "html.parser",
+ ).span
+ assert parse_counter(tag) == 2028
+
+ # 5k+. The real number is not available in the page, use this approx value
+ tag = BeautifulSoup(
+        '<span id="issues-repo-tab-count" title="5,000+" class="Counter">5k+</span>',
+ "html.parser",
+ ).span
+ assert parse_counter(tag) == 5_000