From 3607a16fab201125f2799e6d93ab433b4d872880 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Tue, 17 May 2022 13:38:40 -0400 Subject: [PATCH 1/2] Paginate backwards for stargazers --- tap_github/client.py | 6 ++-- tap_github/repository_streams.py | 56 ++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 222f4ba3..9f110743 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -77,11 +77,11 @@ def get_next_page_token( else: results = resp_json.get("items") - # Exit early if the response has no items. ? Maybe duplicative the "next" link check. + # Exit early if the response has no items. ? Maybe duplicative of the "next" link check. if not results: return None - # Unfortunately endpoints such as /starred, /stargazers, /events and /pulls do not support + # Unfortunately endpoints such as /starred, /events and /pulls do not support # the "since" parameter out of the box. So we use a workaround here to exit early. # For such streams, we sort by descending dates (most recent first), and paginate # "back in time" until we reach records before our "since" parameter. @@ -132,7 +132,7 @@ def get_url_params( params["sort"] = "updated" params["direction"] = "desc" if self.missing_since_parameter else "asc" - # Unfortunately the /starred, /stargazers (starred_at) and /events (created_at) endpoints do not support + # Unfortunately the /starred (starred_at) and /events (created_at) endpoints do not support # the "since" parameter out of the box. But we use a workaround in 'get_next_page_token'. elif self.replication_key in ["starred_at", "created_at"]: params["sort"] = "created" diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index c05acafa..6871b577 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -6,6 +6,9 @@ from singer_sdk import typing as th # JSON Schema typing helpers from singer_sdk.helpers.jsonpath import extract_jsonpath +from dateutil.parser import parse +from urllib.parse import parse_qs, urlparse + from tap_github.client import GitHubGraphqlStream, GitHubRestStream from tap_github.schema_objects import ( user_object, @@ -1496,6 +1499,59 @@ def http_headers(self) -> dict: headers["Accept"] = "application/vnd.github.v3.star+json" return headers + def get_next_page_token( + self, response: requests.Response, previous_token: Optional[Any] + ) -> Optional[Any]: + """Return a token for identifying next page or None if no more pages. + + /stargazers does not have "since" or "direction" parameters. If a since parameter is passed, + we manually paginate backwards and try to exit early. + + WARNING - This does not work for repositories with too many stars like facebook/react. + See https://github.com/MeltanoLabs/tap-github/issues/120 + """ + # Leverage header links returned by the GitHub API. + if not ("next" in response.links.keys() or "prev" in response.links.keys()): + return None + + results = response.json() + request_parameters = parse_qs(str(urlparse(response.request.url).query)) + + # parse_qs interprets "+" as a space, revert this to keep an aware datetime + try: + since = ( + request_parameters["since"][0].replace(" ", "+") + if "since" in request_parameters + else "" + ) + except IndexError: + since = "" + + # if "since" is present, paginate backwards. + next_page_key = "prev" if since else "next" + + # start with the last page + if (previous_token or 1) == 1 and "last" in response.links.keys(): + next_page_key = "last" + # if possible, exit early + elif not results or len(results) < self.MAX_PER_PAGE: + return None + elif since and (parse(results[-1][self.replication_key]) < parse(since)): + return None + + # Use header links returned by the GitHub API. + parsed_url = urlparse(response.links[next_page_key]["url"]) + captured_page_value_list = parse_qs(parsed_url.query).get("page") + next_page_string = ( + captured_page_value_list[0] if captured_page_value_list else None + ) + if next_page_string and next_page_string.isdigit(): + print(next_page_key) + print(int(next_page_string)) + return int(next_page_string) + + return (previous_token or 1) + 1 + def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: """ Add a user_id top-level field to be used as state replication key. From 67fbe9476120bbbe33c2a95fe6e44f8968da5b87 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Tue, 17 May 2022 23:01:56 -0400 Subject: [PATCH 2/2] remove :footprints: --- tap_github/repository_streams.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index 6871b577..349b6298 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -1546,8 +1546,6 @@ def get_next_page_token( captured_page_value_list[0] if captured_page_value_list else None ) if next_page_string and next_page_string.isdigit(): - print(next_page_key) - print(int(next_page_string)) return int(next_page_string) return (previous_token or 1) + 1