From 2b6a3f142a8035affc3a7ad974809da92e9149f2 Mon Sep 17 00:00:00 2001 From: Ryan Date: Mon, 13 Sep 2021 01:18:36 +1000 Subject: [PATCH 01/10] Add readme stream --- .gitignore | 3 +++ tap_github/streams.py | 38 ++++++++++++++++++++++++++++++++++++-- tap_github/tap.py | 3 ++- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 1e3ab0e6..352aa6b3 100644 --- a/.gitignore +++ b/.gitignore @@ -134,6 +134,9 @@ venv.bak/ .dmypy.json dmypy.json +# IDE +.idea/ + # Pyre type checker .pyre/ diff --git a/tap_github/streams.py b/tap_github/streams.py index ddf79cb2..937f20d9 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -2,6 +2,7 @@ from typing import Any, Dict, Iterable, List, Optional +import requests from singer_sdk import typing as th # JSON Schema typing helpers from tap_github.client import GitHubStream @@ -17,7 +18,7 @@ class RepositoryStream(GitHubStream): name = "repositories" def get_url_params( - self, context: Optional[dict], next_page_token: Optional[Any] + self, context: Optional[dict], next_page_token: Optional[Any] ) -> Dict[str, Any]: """Return a dictionary of values to be used in URL parameterization.""" assert context is not None, f"Context cannot be empty for '{self.name}' stream." @@ -92,6 +93,39 @@ def get_child_context(self, record: dict, context: Optional[dict]) -> dict: ).to_dict() +class ReadmeStream(GitHubStream): + name = "readme" + path = "/repos/{org}/{repo}/readme" + primary_keys = ["url"] + # TODO what would this be? 
replication_key = "updated_at" + parent_stream_type = RepositoryStream + ignore_parent_replication_key = False + state_partitioning_keys = ["repo", "org"] + + def parse_response(self, response: requests.Response) -> Iterable[dict]: + json = response.json() + yield json + + schema = th.PropertiesList( + th.Property("type", th.StringType), + th.Property("encoding", th.StringType), + th.Property("size", th.IntegerType), + th.Property("name", th.StringType), + th.Property("path", th.StringType), + th.Property("content", th.StringType), + th.Property("sha", th.StringType), + th.Property("url", th.StringType), + th.Property("git_url", th.StringType), + th.Property("html_url", th.StringType), + th.Property("download_url", th.StringType), + th.Property("_links", th.PropertiesList( + th.Property("git", th.StringType), + th.Property("self", th.StringType), + th.Property("html", th.StringType), + )), + ).to_dict() + + class IssuesStream(GitHubStream): """Defines 'Issues' stream.""" @@ -167,7 +201,7 @@ def get_records(self, context: Optional[dict] = None) -> Iterable[Dict[str, Any] return super().get_records(context) def get_url_params( - self, context: Optional[dict], next_page_token: Optional[Any] + self, context: Optional[dict], next_page_token: Optional[Any] ) -> Dict[str, Any]: """Return a dictionary of values to be used in URL parameterization.""" params = super().get_url_params(context, next_page_token) diff --git a/tap_github/tap.py b/tap_github/tap.py index b7945a5f..9a056712 100644 --- a/tap_github/tap.py +++ b/tap_github/tap.py @@ -5,7 +5,7 @@ from singer_sdk import Tap, Stream from singer_sdk import typing as th # JSON schema typing helpers -from tap_github.streams import RepositoryStream, IssuesStream, IssueCommentsStream +from tap_github.streams import RepositoryStream, IssuesStream, IssueCommentsStream, ReadmeStream class TapGitHub(Tap): @@ -38,6 +38,7 @@ def discover_streams(self) -> List[Stream]: RepositoryStream(tap=self), IssuesStream(tap=self), 
IssueCommentsStream(tap=self), + ReadmeStream(tap=self), ] From eb242ca66367c30f9627475920d918ca81971267 Mon Sep 17 00:00:00 2001 From: Ryan Date: Mon, 13 Sep 2021 01:19:22 +1000 Subject: [PATCH 02/10] run formatter --- tap_github/streams.py | 17 ++++++++++------- tap_github/tap.py | 7 ++++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index 937f20d9..3e200108 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -18,7 +18,7 @@ class RepositoryStream(GitHubStream): name = "repositories" def get_url_params( - self, context: Optional[dict], next_page_token: Optional[Any] + self, context: Optional[dict], next_page_token: Optional[Any] ) -> Dict[str, Any]: """Return a dictionary of values to be used in URL parameterization.""" assert context is not None, f"Context cannot be empty for '{self.name}' stream." @@ -118,11 +118,14 @@ def parse_response(self, response: requests.Response) -> Iterable[dict]: th.Property("git_url", th.StringType), th.Property("html_url", th.StringType), th.Property("download_url", th.StringType), - th.Property("_links", th.PropertiesList( - th.Property("git", th.StringType), - th.Property("self", th.StringType), - th.Property("html", th.StringType), - )), + th.Property( + "_links", + th.PropertiesList( + th.Property("git", th.StringType), + th.Property("self", th.StringType), + th.Property("html", th.StringType), + ), + ), ).to_dict() @@ -201,7 +204,7 @@ def get_records(self, context: Optional[dict] = None) -> Iterable[Dict[str, Any] return super().get_records(context) def get_url_params( - self, context: Optional[dict], next_page_token: Optional[Any] + self, context: Optional[dict], next_page_token: Optional[Any] ) -> Dict[str, Any]: """Return a dictionary of values to be used in URL parameterization.""" params = super().get_url_params(context, next_page_token) diff --git a/tap_github/tap.py b/tap_github/tap.py index 9a056712..d1005c80 100644 --- a/tap_github/tap.py 
+++ b/tap_github/tap.py @@ -5,7 +5,12 @@ from singer_sdk import Tap, Stream from singer_sdk import typing as th # JSON schema typing helpers -from tap_github.streams import RepositoryStream, IssuesStream, IssueCommentsStream, ReadmeStream +from tap_github.streams import ( + RepositoryStream, + IssuesStream, + IssueCommentsStream, + ReadmeStream, +) class TapGitHub(Tap): From e24b0263e8c5773ea6efefb6e0a90ed7c84f7758 Mon Sep 17 00:00:00 2001 From: Ryan Samarakoon Date: Tue, 14 Sep 2021 00:27:12 +1000 Subject: [PATCH 03/10] Change to object type --- tap_github/streams.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index e0fbe4f2..eb467aea 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -186,7 +186,7 @@ def parse_response(self, response: requests.Response) -> Iterable[dict]: th.Property("download_url", th.StringType), th.Property( "_links", - th.PropertiesList( + th.ObjectType( th.Property("git", th.StringType), th.Property("self", th.StringType), th.Property("html", th.StringType), From 9c9960d047e62d7f63eb385e2e9fbadba6340906 Mon Sep 17 00:00:00 2001 From: Ryan Date: Thu, 16 Sep 2021 01:17:13 +1000 Subject: [PATCH 04/10] Address comments --- tap_github/streams.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index 11588363..e1de1a24 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -2,8 +2,6 @@ import requests from typing import Any, Dict, Iterable, List, Optional - -import requests from singer_sdk import typing as th # JSON Schema typing helpers from tap_github.client import GitHubStream @@ -169,8 +167,7 @@ class ReadmeStream(GitHubStream): state_partitioning_keys = ["repo", "org"] def parse_response(self, response: requests.Response) -> Iterable[dict]: - json = response.json() - yield json + return [response.json()] schema = th.PropertiesList( th.Property("type", th.StringType), From 
39fd138c1a3d4632cfe1390c546934dc743388d7 Mon Sep 17 00:00:00 2001 From: Ryan Date: Thu, 16 Sep 2021 01:20:44 +1000 Subject: [PATCH 05/10] Remove updated_at todo --- tap_github/streams.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index e1de1a24..258f2b1b 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -161,7 +161,6 @@ class ReadmeStream(GitHubStream): name = "readme" path = "/repos/{org}/{repo}/readme" primary_keys = ["url"] - # TODO what would this be? replication_key = "updated_at" parent_stream_type = RepositoryStream ignore_parent_replication_key = False state_partitioning_keys = ["repo", "org"] From 6f16d6b9fa50d6320112af68c5d6550e0b75fd54 Mon Sep 17 00:00:00 2001 From: Ryan Samarakoon Date: Thu, 16 Sep 2021 16:43:15 +1000 Subject: [PATCH 06/10] Use repo/org primary key and add to properties --- tap_github/streams.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tap_github/streams.py b/tap_github/streams.py index 258f2b1b..1d808dab 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -160,7 +160,7 @@ def get_child_context(self, record: dict, context: Optional[dict]) -> dict: class ReadmeStream(GitHubStream): name = "readme" path = "/repos/{org}/{repo}/readme" - primary_keys = ["url"] + primary_keys = ["repo", "org"] parent_stream_type = RepositoryStream ignore_parent_replication_key = False state_partitioning_keys = ["repo", "org"] @@ -169,6 +169,10 @@ def parse_response(self, response: requests.Response) -> Iterable[dict]: return [response.json()] schema = th.PropertiesList( + # Parent Keys + th.Property("repo", th.StringType), + th.Property("org", th.StringType), + # README Keys th.Property("type", th.StringType), th.Property("encoding", th.StringType), th.Property("size", th.IntegerType), From b3b90cb0b5547525944c54c8074d324aa29e49c6 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Thu, 16 Sep 2021 14:04:00 -0700 Subject: [PATCH 07/10] Add 
CommunityProfile stream --- tap_github/streams.py | 86 +++++++++++++++++++++++++++++++++++++++++++ tap_github/tap.py | 2 + 2 files changed, 88 insertions(+) diff --git a/tap_github/streams.py b/tap_github/streams.py index 1d808dab..333a1e6b 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -195,6 +195,92 @@ def parse_response(self, response: requests.Response) -> Iterable[dict]: ).to_dict() +class CommunityProfileStream(GitHubStream): + """Defines 'CommunityProfile' stream.""" + + name = "community_profile" + path = "/repos/{org}/{repo}/community/profile" + primary_keys = ["repo", "org"] + parent_stream_type = RepositoryStream + ignore_parent_replication_key = False + state_partitioning_keys = ["repo", "org"] + + def parse_response(self, response: requests.Response) -> Iterable[dict]: + return [response.json()] + + schema = th.PropertiesList( + # Parent Keys + th.Property("repo", th.StringType), + th.Property("org", th.StringType), + # Community Profile + th.Property("health_percentage", th.IntegerType), + th.Property("description", th.StringType), + th.Property("documentation", th.StringType), + th.Property("updated_at", th.DateTimeType), + th.Property("content_reports_enabled", th.BooleanType), + th.Property( + "files", + th.ObjectType( + th.Property( + "code_of_conduct", + th.ObjectType( + th.Property("key", th.StringType), + th.Property("name", th.StringType), + th.Property("html_url", th.StringType), + th.Property("url", th.StringType), + ) + ), + th.Property( + "code_of_conduct_file", + th.ObjectType( + th.Property("url", th.StringType), + th.Property("html_url", th.StringType), + ) + ), + th.Property( + "contributing", + th.ObjectType( + th.Property("url", th.StringType), + th.Property("html_url", th.StringType), + ) + ), + th.Property( + "issue_template", + th.ObjectType( + th.Property("url", th.StringType), + th.Property("html_url", th.StringType), + ) + ), + th.Property( + "pull_request_template", + th.ObjectType( + th.Property("url", 
th.StringType), + th.Property("html_url", th.StringType), + ) + ), + th.Property( + "license", + th.ObjectType( + th.Property("key", th.StringType), + th.Property("name", th.StringType), + th.Property("spdx_id", th.StringType), + th.Property("node_id", th.StringType), + th.Property("html_url", th.StringType), + th.Property("url", th.StringType), + ) + ), + th.Property( + "readme", + th.ObjectType( + th.Property("url", th.StringType), + th.Property("html_url", th.StringType), + ) + ), + ), + ), + ).to_dict() + + class IssuesStream(GitHubStream): """Defines 'Issues' stream.""" diff --git a/tap_github/tap.py b/tap_github/tap.py index cd5330c5..00a28427 100644 --- a/tap_github/tap.py +++ b/tap_github/tap.py @@ -10,6 +10,7 @@ IssuesStream, IssueCommentsStream, ReadmeStream, + CommunityProfileStream, ) @@ -44,6 +45,7 @@ def discover_streams(self) -> List[Stream]: IssuesStream(tap=self), IssueCommentsStream(tap=self), ReadmeStream(tap=self), + CommunityProfileStream(tap=self), ] From da217538c25338731b803c007ece84ee58243500 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Thu, 16 Sep 2021 17:30:07 -0700 Subject: [PATCH 08/10] Override _request_with_backoff and introduce tolerated_http_errors --- tap_github/client.py | 51 ++++++++++++++++++++++++++++++++++++++++++- tap_github/streams.py | 16 +------------- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 20f86eee..0f709f96 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -20,6 +20,7 @@ def url_base(self) -> str: primary_keys = ["id"] replication_key: Optional[str] = None + tolerated_http_errors = [] @property def http_headers(self) -> dict: @@ -71,13 +72,61 @@ def get_url_params( params["since"] = since return params + def _request_with_backoff( + self, prepared_request, context: Optional[dict] + ) -> requests.Response: + """Override private method _request_with_backoff to account for expected 404 Not Found errors.""" + # TODO - Adapt Singer + 
response = self.requests_session.send(prepared_request) + if self._LOG_REQUEST_METRICS: + extra_tags = {} + if self._LOG_REQUEST_METRIC_URLS: + extra_tags["url"] = cast(str, prepared_request.path_url) + self._write_request_duration_log( + endpoint=self.path, + response=response, + context=context, + extra_tags=extra_tags, + ) + if response.status_code in self.tolerated_http_errors: + self.logger.info("Request returned a tolerated error for {}".format(prepared_request.url)) + self.logger.info( + f"Reason: {response.status_code} - {str(response.content)}" + ) + return response + + if response.status_code in [401, 403]: + self.logger.info("Failed request for {}".format(prepared_request.url)) + self.logger.info( + f"Reason: {response.status_code} - {str(response.content)}" + ) + raise RuntimeError( + "Requested resource was unauthorized, forbidden, or not found." + ) + elif response.status_code >= 400: + raise RuntimeError( + f"Error making request to API: {prepared_request.url} " + f"[{response.status_code} - {str(response.content)}]".replace( + "\\n", "\n" + ) + ) + self.logger.debug("Response received successfully.") + return response + def parse_response(self, response: requests.Response) -> Iterable[dict]: """Parse the response and return an iterator of result rows.""" + # TODO - Split into handle_response and parse_response. 
+ if response.status_code in self.tolerated_http_errors: + return [] + resp_json = response.json() + if isinstance(resp_json, list): results = resp_json - else: + elif resp_json.get("items") is not None: results = resp_json.get("items") + else: + results = [resp_json] for row in results: yield row diff --git a/tap_github/streams.py b/tap_github/streams.py index 333a1e6b..7601f4d9 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -57,15 +57,6 @@ def partitions(self) -> Optional[List[Dict]]: return [{"org": r[0], "repo": r[1]} for r in split_repo_names] return None - def parse_response(self, response: requests.Response) -> Iterable[dict]: - """ - Parse the response which differs for this stream depending on which mode it is run in. - """ - if "searches" in self.config: - return super(GitHubStream, self).parse_response(response) - else: - return [response.json()] - def get_child_context(self, record: dict, context: Optional[dict]) -> dict: """Return a child context object from the record and optional provided context. 
@@ -165,9 +156,6 @@ class ReadmeStream(GitHubStream): ignore_parent_replication_key = False state_partitioning_keys = ["repo", "org"] - def parse_response(self, response: requests.Response) -> Iterable[dict]: - return [response.json()] - schema = th.PropertiesList( # Parent Keys th.Property("repo", th.StringType), @@ -204,9 +192,7 @@ class CommunityProfileStream(GitHubStream): parent_stream_type = RepositoryStream ignore_parent_replication_key = False state_partitioning_keys = ["repo", "org"] - - def parse_response(self, response: requests.Response) -> Iterable[dict]: - return [response.json()] + tolerated_http_errors = [404] schema = th.PropertiesList( # Parent Keys From d89e49215ad7aacb35bb907abdb87ea96b78bac8 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Thu, 23 Sep 2021 10:38:55 -0700 Subject: [PATCH 09/10] Linting --- tap_github/client.py | 4 +++- tap_github/streams.py | 14 +++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 0f709f96..8e3aea80 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -89,7 +89,9 @@ def _request_with_backoff( extra_tags=extra_tags, ) if response.status_code in self.tolerated_http_errors: - self.logger.info("Request returned a tolerated error for {}".format(prepared_request.url)) + self.logger.info( + "Request returned a tolerated error for {}".format(prepared_request.url) + ) self.logger.info( f"Reason: {response.status_code} - {str(response.content)}" ) diff --git a/tap_github/streams.py b/tap_github/streams.py index 7c6b0e99..41331dd2 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -213,35 +213,35 @@ class CommunityProfileStream(GitHubStream): th.Property("name", th.StringType), th.Property("html_url", th.StringType), th.Property("url", th.StringType), - ) + ), ), th.Property( "code_of_conduct_file", th.ObjectType( th.Property("url", th.StringType), th.Property("html_url", th.StringType), - ) + ), ), th.Property( 
"contributing", th.ObjectType( th.Property("url", th.StringType), th.Property("html_url", th.StringType), - ) + ), ), th.Property( "issue_template", th.ObjectType( th.Property("url", th.StringType), th.Property("html_url", th.StringType), - ) + ), ), th.Property( "pull_request_template", th.ObjectType( th.Property("url", th.StringType), th.Property("html_url", th.StringType), - ) + ), ), th.Property( "license", @@ -252,14 +252,14 @@ class CommunityProfileStream(GitHubStream): th.Property("node_id", th.StringType), th.Property("html_url", th.StringType), th.Property("url", th.StringType), - ) + ), ), th.Property( "readme", th.ObjectType( th.Property("url", th.StringType), th.Property("html_url", th.StringType), - ) + ), ), ), ), From 497e040299eb4568defc8dcbd251828cc3fc0a1a Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Thu, 23 Sep 2021 14:00:14 -0700 Subject: [PATCH 10/10] Typing --- tap_github/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index 8e3aea80..2ca3db3c 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -1,7 +1,7 @@ """REST client handling, including GitHubStream base class.""" import requests -from typing import Any, Dict, Optional, Iterable, cast +from typing import Any, Dict, List, Optional, Iterable, cast from singer_sdk.streams import RESTStream @@ -20,7 +20,7 @@ def url_base(self) -> str: primary_keys = ["id"] replication_key: Optional[str] = None - tolerated_http_errors = [] + tolerated_http_errors: List[int] = [] @property def http_headers(self) -> dict: