Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Paginate backwards for stargazers #121

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions tap_github/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,11 @@ def get_next_page_token(
else:
results = resp_json.get("items")

# Exit early if the response has no items. ? Maybe duplicative the "next" link check.
# Exit early if the response has no items. ? Maybe duplicative of the "next" link check.
if not results:
return None

# Unfortunately endpoints such as /starred, /stargazers, /events and /pulls do not support
# Unfortunately endpoints such as /starred, /events and /pulls do not support
# the "since" parameter out of the box. So we use a workaround here to exit early.
# For such streams, we sort by descending dates (most recent first), and paginate
# "back in time" until we reach records before our "since" parameter.
Expand Down Expand Up @@ -132,7 +132,7 @@ def get_url_params(
params["sort"] = "updated"
params["direction"] = "desc" if self.missing_since_parameter else "asc"

# Unfortunately the /starred, /stargazers (starred_at) and /events (created_at) endpoints do not support
# Unfortunately the /starred (starred_at) and /events (created_at) endpoints do not support
# the "since" parameter out of the box. But we use a workaround in 'get_next_page_token'.
elif self.replication_key in ["starred_at", "created_at"]:
params["sort"] = "created"
Expand Down
56 changes: 56 additions & 0 deletions tap_github/repository_streams.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
from singer_sdk import typing as th # JSON Schema typing helpers
from singer_sdk.helpers.jsonpath import extract_jsonpath

from dateutil.parser import parse
from urllib.parse import parse_qs, urlparse

from tap_github.client import GitHubGraphqlStream, GitHubRestStream
from tap_github.schema_objects import (
user_object,
Expand Down Expand Up @@ -1496,6 +1499,59 @@ def http_headers(self) -> dict:
headers["Accept"] = "application/vnd.github.v3.star+json"
return headers

def get_next_page_token(
    self, response: requests.Response, previous_token: Optional[Any]
) -> Optional[Any]:
    """Return the page number to request next, or None if no more pages.

    /stargazers does not have "since" or "direction" parameters, so the
    direction of pagination is decided from the request URL:

    * no "since" in the request URL: follow the "next" header link forwards.
    * "since" present: use the very first response only to jump to the
      "last" page, then follow the "prev" links and exit as soon as the
      last record of a page is older than "since".

    WARNING - This does not work for repositories with too many stars like facebook/react.
    See https://github.com/MeltanoLabs/tap-github/issues/120

    Args:
        response: The HTTP response of the page that was just fetched. Its
            ``links`` (parsed header links), ``request.url`` and JSON body
            are read.
        previous_token: The page number used for the request that produced
            ``response``; None is treated as the first request.

    Returns:
        The page number of the next request as an int, or None to stop.
    """
    # Leverage header links returned by the GitHub API.
    links = response.links
    if "next" not in links and "prev" not in links:
        return None

    results = response.json()
    request_parameters = parse_qs(str(urlparse(response.request.url).query))

    # parse_qs interprets "+" as a space, revert this to keep an aware datetime.
    # parse_qs never yields an empty list, so indexing [0] is safe here.
    since = request_parameters.get("since", [""])[0].replace(" ", "+")

    if not since:
        # Plain forward pagination: stop on an empty or short page, or when
        # the API does not advertise a following page.
        if (
            not results
            or len(results) < self.MAX_PER_PAGE
            or "next" not in links
        ):
            return None
        next_page_key = "next"
    elif previous_token is None and "last" in links:
        # First request of a backwards sync: start with the last page.
        # Checking for None (rather than page 1) prevents jumping to the last
        # page again once the backwards walk reaches page 1.
        next_page_key = "last"
    else:
        # Walking backwards. A short page is expected here (the last page is
        # usually partial), so it is not a reason to stop.
        if not results or "prev" not in links:
            return None
        # If possible, exit early: the whole page predates "since".
        if parse(results[-1][self.replication_key]) < parse(since):
            return None
        next_page_key = "prev"

    # Use header links returned by the GitHub API.
    parsed_url = urlparse(links[next_page_key]["url"])
    captured_page_value_list = parse_qs(parsed_url.query).get("page")
    next_page_string = (
        captured_page_value_list[0] if captured_page_value_list else None
    )
    if next_page_string and next_page_string.isdigit():
        return int(next_page_string)

    # The link carried no usable page number: derive it from the direction.
    if next_page_key == "next":
        return (previous_token or 1) + 1
    if (
        next_page_key == "prev"
        and isinstance(previous_token, int)
        and previous_token > 1
    ):
        return previous_token - 1
    return None

def post_process(self, row: dict, context: Optional[Dict] = None) -> dict:
"""
Add a user_id top-level field to be used as state replication key.
Expand Down