Skip to content

Commit

Permalink
feat: Add traffic streams (#193)
Browse files Browse the repository at this point in the history
* Fix stargazers empty

* Have to convert to list first

* Add new streams to collect traffic data

* Run black formatter

* Fix error message

* Apply suggestion

* Sorry, just realized my primary key was wrong

* Add condition to skip invalid data

* Add note about permissions

* Accept Edgar's suggestion

Co-authored-by: Edgar R. M. <[email protected]>

* Accept Edgar's suggestion

Co-authored-by: Edgar R. M. <[email protected]>

* Update tap_github/repository_streams.py

Co-authored-by: Edgar R. M. <[email protected]>

* Update README.md

Co-authored-by: Edgar R. M. <[email protected]>

---------

Co-authored-by: Edgar R. M. <[email protected]>
  • Loading branch information
sicarul and edgarrmondragon authored May 5, 2023
1 parent a885286 commit ec5ba3b
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 0 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ To avoid this, the GitHub streams will exit early. I.e. when there are no more `

You can easily run `tap-github` by itself or in a pipeline using [Meltano](https://www.meltano.com).

### Notes regarding permissions

* For the `traffic_*` streams, [you will need write access to the repository](https://docs.github.com/en/rest/metrics/traffic?apiVersion=2022-11-28). You can enable extraction for these streams by [selecting them in the catalog](https://hub.meltano.com/singer/spec/#metadata).

### Executing the Tap Directly

```bash
Expand Down
131 changes: 131 additions & 0 deletions tap_github/repository_streams.py
Original file line number Diff line number Diff line change
Expand Up @@ -2244,3 +2244,134 @@ def query(self) -> str:
),
),
).to_dict()


class TrafficRestStream(GitHubRestStream):
    """Base class for the repository traffic streams.

    Compared to a plain ``GitHubRestStream`` this class:

    * deselects the stream by default when the tap was started without an
      input catalog (see ``metadata``);
    * yields no records for any non-200 response (see ``parse_response``);
    * tolerates the 403 "missing permissions" responses instead of failing
      the sync (see ``validate_response``).
    """

    # 403 error messages that mean "the token cannot read traffic data for this
    # repository". The first is the one the original implementation checked
    # for; the second is the one the original docstring said should be
    # tolerated.
    # NOTE(review): confirm both messages against real API responses.
    _PERMISSION_ERROR_MESSAGES = (
        "Resource not accessible by integration",
        "Must have push access to repository",
    )

    @property
    def metadata(self):
        """Override default selection metadata for this stream.

        When no input catalog was supplied, the stream's root metadata entry
        is marked as not selected, so the stream is opt-in.

        TODO: Remove this in favor of the recommended approach when the SDK
        has one.
        """
        result = super().metadata
        if self._tap_input_catalog is None:
            result.root.selected = False
        return result

    def parse_response(self, response: requests.Response) -> Iterable[dict]:
        """Parse the response and return an iterator of result rows.

        Responses with a status code other than 200 (for example the 403
        tolerated by ``validate_response``) produce no rows.
        """
        if response.status_code != 200:
            # This is a generator: a bare return simply ends the iteration.
            return

        yield from extract_jsonpath(self.records_jsonpath, input=response.json())

    def validate_response(self, response: requests.Response) -> None:
        """Allow some specific errors.

        Do not raise on a 403 whose message is one of
        ``_PERMISSION_ERROR_MESSAGES``: these are expected for this stream
        when we don't have write permissions on the repository. Every other
        response is delegated to the parent class's validation.
        """
        if response.status_code == 403:
            try:
                contents = response.json()
            except ValueError:
                # Body is not JSON; let the parent class report the error.
                contents = None

            message = contents.get("message") if isinstance(contents, dict) else None
            if message in self._PERMISSION_ERROR_MESSAGES:
                self.logger.info("Permissions missing to sync stream '%s'", self.name)
                return

        super().validate_response(response)


class TrafficClonesStream(TrafficRestStream):
    """Stream ``traffic_clones``: clone counts of a repository.

    Child stream of ``RepositoryStream``. One record is emitted per element
    of the ``clones`` array in the API response.
    """

    name = "traffic_clones"

    # Relationship to the parent stream and state layout.
    parent_stream_type = RepositoryStream
    ignore_parent_replication_key = True
    state_partitioning_keys = ["repo", "org"]

    # Where the records come from.
    path = "/repos/{org}/{repo}/traffic/clones"
    records_jsonpath = "$.clones[*]"

    # Record identity and incremental bookmark.
    primary_keys = ["repo", "org", "timestamp"]
    replication_key = "timestamp"

    schema = th.PropertiesList(
        # Keys inherited from the parent repository context
        th.Property("repo", th.StringType),
        th.Property("org", th.StringType),
        th.Property("repo_id", th.IntegerType),
        # Fields of each clones entry
        th.Property("timestamp", th.DateTimeType),
        th.Property("count", th.IntegerType),
        th.Property("uniques", th.IntegerType),
    ).to_dict()


class TrafficReferralPathsStream(TrafficRestStream):
    """Stream ``traffic_referral_paths``: entries of the popular-paths endpoint.

    Child stream of ``RepositoryStream``. The response is a top-level array
    and every element becomes one record. There is no replication key.
    """

    name = "traffic_referral_paths"

    # Relationship to the parent stream and state layout.
    parent_stream_type = RepositoryStream
    ignore_parent_replication_key = True
    state_partitioning_keys = ["repo", "org"]

    # Where the records come from.
    path = "/repos/{org}/{repo}/traffic/popular/paths"
    records_jsonpath = "[*]"

    # Record identity; no incremental bookmark for this stream.
    primary_keys = ["repo", "org", "path"]
    replication_key = None

    schema = th.PropertiesList(
        # Keys inherited from the parent repository context
        th.Property("repo", th.StringType),
        th.Property("org", th.StringType),
        th.Property("repo_id", th.IntegerType),
        # Fields of each path entry
        th.Property("path", th.StringType),
        th.Property("title", th.StringType),
        th.Property("count", th.IntegerType),
        th.Property("uniques", th.IntegerType),
    ).to_dict()


class TrafficReferrersStream(TrafficRestStream):
    """Stream ``traffic_referrers``: entries of the popular-referrers endpoint.

    Child stream of ``RepositoryStream``. The response is a top-level array
    and every element becomes one record. There is no replication key.
    """

    name = "traffic_referrers"

    # Relationship to the parent stream and state layout.
    parent_stream_type = RepositoryStream
    ignore_parent_replication_key = True
    state_partitioning_keys = ["repo", "org"]

    # Where the records come from.
    path = "/repos/{org}/{repo}/traffic/popular/referrers"
    records_jsonpath = "[*]"

    # Record identity; no incremental bookmark for this stream.
    primary_keys = ["repo", "org", "referrer"]
    replication_key = None

    schema = th.PropertiesList(
        # Keys inherited from the parent repository context
        th.Property("repo", th.StringType),
        th.Property("org", th.StringType),
        th.Property("repo_id", th.IntegerType),
        # Fields of each referrer entry
        th.Property("referrer", th.StringType),
        th.Property("count", th.IntegerType),
        th.Property("uniques", th.IntegerType),
    ).to_dict()


class TrafficPageViewsStream(TrafficRestStream):
    """Stream ``traffic_pageviews``: page view counts of a repository.

    Child stream of ``RepositoryStream``. One record is emitted per element
    of the ``views`` array in the API response. There is no replication key.
    """

    name = "traffic_pageviews"

    # Relationship to the parent stream and state layout.
    parent_stream_type = RepositoryStream
    ignore_parent_replication_key = True
    state_partitioning_keys = ["repo", "org"]

    # Where the records come from.
    path = "/repos/{org}/{repo}/traffic/views"
    records_jsonpath = "$.views[*]"

    # Record identity; no incremental bookmark for this stream.
    primary_keys = ["repo", "org", "timestamp"]
    replication_key = None

    schema = th.PropertiesList(
        # Keys inherited from the parent repository context
        th.Property("repo", th.StringType),
        th.Property("org", th.StringType),
        th.Property("repo_id", th.IntegerType),
        # Fields of each views entry
        th.Property("timestamp", th.DateTimeType),
        th.Property("count", th.IntegerType),
        th.Property("uniques", th.IntegerType),
    ).to_dict()
8 changes: 8 additions & 0 deletions tap_github/streams.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@
StargazersGraphqlStream,
StargazersStream,
StatsContributorsStream,
TrafficClonesStream,
TrafficPageViewsStream,
TrafficReferralPathsStream,
TrafficReferrersStream,
WorkflowRunJobsStream,
WorkflowRunsStream,
WorkflowsStream,
Expand Down Expand Up @@ -94,6 +98,10 @@ def __init__(self, valid_queries: Set[str], streams: List[Type[Stream]]):
StargazersGraphqlStream,
StargazersStream,
StatsContributorsStream,
TrafficClonesStream,
TrafficPageViewsStream,
TrafficReferralPathsStream,
TrafficReferrersStream,
WorkflowRunJobsStream,
WorkflowRunsStream,
WorkflowsStream,
Expand Down

0 comments on commit ec5ba3b

Please sign in to comment.