From 67f4d32d13d62220092471ec8afb352793b04fe0 Mon Sep 17 00:00:00 2001 From: Josh lloyd Date: Tue, 12 Dec 2023 15:35:59 -0700 Subject: [PATCH 1/6] added multiple sheets feature --- .gitignore | 8 ++- README.md | 19 +++++-- meltano.yml | 1 + tap_google_sheets/streams.py | 3 +- tap_google_sheets/tap.py | 96 +++++++++++++++++++++--------------- 5 files changed, 83 insertions(+), 44 deletions(-) diff --git a/.gitignore b/.gitignore index 186388e..7a86c60 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,9 @@ .meltano/ +# Ignore meltano plugins lock files +plugins/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -136,4 +139,7 @@ dmypy.json .pyre/ # VSCode files -.vscode \ No newline at end of file +.vscode + +# intellij +.idea \ No newline at end of file diff --git a/README.md b/README.md index 2c55a40..c1b9da2 100644 --- a/README.md +++ b/README.md @@ -56,11 +56,21 @@ Your `sheet_id` are the characters after `spreadsheets/d/`, so in this case woul ### Credentials +Setting | Required | Type | Description | +------- | -------- |------------------| ----------- | +`oauth_credentials.client_id` | Required | String | Your google client id +`oauth_credentials.client_secret` | Required | String | Your google client secret +`oauth_credentials.refresh_token` | Required | String | Your google refresh token +`sheet_id` | Required | String | Your target google sheet id +`stream_name` | Optional | String | Optionally rename the stream and output file or table from the tap +`child_sheet_name` | Optional | String | Optionally choose a different sheet from your Google Sheet file +`key_properties` | Optional | Array of Strings | Optionally choose primary key column(s) from your Google Sheet file. Example: `["column_one", "column_two"]` +`sheets` | Optional | Array of Objects | Optionally provide a list of configs for each sheet/stream. See "Per Sheet Config" below. Overrides the `sheet_id` provided at the root level. 
+ +### Per Sheet Config + Setting | Required | Type | Description | ------- | -------- | ---- | ----------- | -`oauth_credentials.client_id` | Required | String | Your google client id -`oauth_credentials.client_secret` | Required | String | Your google client secret -`oauth_credentials.refresh_token` | Required | String | Your google refresh token `sheet_id` | Required | String | Your target google sheet id `stream_name` | Optional | String | Optionailly rename the stream and output file or table from the tap `child_sheet_name` | Optional | String | Optionally choose a different sheet from your Google Sheet file @@ -76,6 +86,7 @@ These settings expand into environment variables of: - `TAP_GOOGLE_SHEETS_STREAM_NAME` - `TAP_GOOGLE_SHEETS_CHILD_SHEET_NAME` - `TAP_GOOGLE_SHEETS_KEY_PROPERTIES` +- `TAP_GOOGLE_SHEETS_SHEETS` --- @@ -97,6 +108,8 @@ These settings expand into environment variables of: * When using the `key_properties` setting, you must choose columns with no null values. +* You can extract multiple sheets using the `sheets` config, an array in which each item holds the configurable properties for one sheet. When `sheets` is set, the root-level `sheet_id`, `stream_name`, `child_sheet_name`, and `key_properties` settings are ignored. 
+ ### Loaders Tested - [target-jsonl](https://hub.meltano.com/targets/jsonl) diff --git a/meltano.yml b/meltano.yml index db16a83..e58fd51 100644 --- a/meltano.yml +++ b/meltano.yml @@ -22,6 +22,7 @@ plugins: - name: sheet_id - name: stream_name - name: child_sheet_name + - name: sheets loaders: - name: target-jsonl variant: andyh1203 diff --git a/tap_google_sheets/streams.py b/tap_google_sheets/streams.py index 32f3e1a..20797b2 100644 --- a/tap_google_sheets/streams.py +++ b/tap_google_sheets/streams.py @@ -18,11 +18,12 @@ class GoogleSheetsStream(GoogleSheetsBaseStream): child_sheet_name = None primary_key = None url_base = "https://sheets.googleapis.com/v4/spreadsheets" + stream_config = None @property def path(self): """Set the path for the stream.""" - return f"/{self.config['sheet_id']}/values/{self.child_sheet_name}" + return f"/{self.stream_config['sheet_id']}/values/{self.child_sheet_name}" def parse_response(self, response: requests.Response) -> Iterable[dict]: """Parse response, build response back up into json, update stream schema.""" diff --git a/tap_google_sheets/tap.py b/tap_google_sheets/tap.py index f87c8ee..6675fc8 100644 --- a/tap_google_sheets/tap.py +++ b/tap_google_sheets/tap.py @@ -15,22 +15,7 @@ class TapGoogleSheets(Tap): name = "tap-google-sheets" - config_jsonschema = th.PropertiesList( - th.Property( - "oauth_credentials.client_id", - th.StringType, - description="Your google client_id", - ), - th.Property( - "oauth_credentials.client_secret", - th.StringType, - description="Your google client_secret", - ), - th.Property( - "oauth_credentials.refresh_token", - th.StringType, - description="Your google refresh token", - ), + per_sheet_config = th.ObjectType( th.Property("sheet_id", th.StringType, description="Your google sheet id"), th.Property( "output_name", @@ -42,7 +27,7 @@ class TapGoogleSheets(Tap): "child_sheet_name", th.StringType, description="Optionally sync data from a different sheet in" - + " your Google Sheet", + + " your 
Google Sheet", required=False, ), th.Property( @@ -51,42 +36,75 @@ class TapGoogleSheets(Tap): description="Optionally choose one or more primary key columns", required=False, ), - ).to_dict() + ) + + config_jsonschema = th.PropertiesList( + th.Property( + "oauth_credentials.client_id", + th.StringType, + description="Your google client_id", + ), + th.Property( + "oauth_credentials.client_secret", + th.StringType, + description="Your google client_secret", + ), + th.Property( + "oauth_credentials.refresh_token", + th.StringType, + description="Your google refresh token", + ), + th.Property( + "sheets", + required=False, + description="The list of configs for each sheet/stream.", + wrapped=th.ArrayType(per_sheet_config), + ) + ) + + for prop in per_sheet_config.wrapped.values(): + # raise Exception(prop.name) + config_jsonschema.append(prop) + + config_jsonschema = config_jsonschema.to_dict() def discover_streams(self) -> List[Stream]: """Return a list of discovered streams.""" streams: List[Stream] = [] - stream_name = self.config.get("stream_name") or self.get_sheet_name() - stream_name = stream_name.replace(" ", "_") - key_properties = self.config.get("key_properties", []) + sheets = self.config.get("sheets") or [self.config] + for stream_config in sheets: + stream_name = stream_config.get("stream_name") or self.get_sheet_name() + stream_name = stream_name.replace(" ", "_") + key_properties = stream_config.get("key_properties", []) - google_sheet_data = self.get_sheet_data() + google_sheet_data = self.get_sheet_data(stream_config) - stream_schema = self.get_schema(google_sheet_data) + stream_schema = self.get_schema(google_sheet_data) - child_sheet_name = self.config.get( - "child_sheet_name" - ) or self.get_first_visible_child_sheet_name(google_sheet_data) + child_sheet_name = self.config.get( + "child_sheet_name" + ) or self.get_first_visible_child_sheet_name(google_sheet_data) - if stream_name: - stream = GoogleSheetsStream( - tap=self, name=stream_name, 
schema=stream_schema - ) - stream.child_sheet_name = child_sheet_name - stream.selected - stream.primary_keys = key_properties - streams.append(stream) + if stream_name: + stream = GoogleSheetsStream( + tap=self, name=stream_name, schema=stream_schema + ) + stream.child_sheet_name = child_sheet_name + stream.selected + stream.primary_keys = key_properties + stream.stream_config = stream_config + streams.append(stream) return streams - def get_sheet_name(self): + def get_sheet_name(self, stream_config): """Get the name of the spreadsheet.""" config_stream = GoogleSheetsBaseStream( tap=self, name="config", schema={"one": "one"}, - path="https://www.googleapis.com/drive/v2/files/" + self.config["sheet_id"], + path="https://www.googleapis.com/drive/v2/files/" + stream_config["sheet_id"], ) prepared_request = config_stream.prepare_request(None, None) @@ -112,16 +130,16 @@ def get_first_visible_child_sheet_name(self, google_sheet_data: requests.Respons return sheet_in_sheet_name - def get_sheet_data(self): + def get_sheet_data(self, stream_config): """Get the data from the selected or first visible sheet in the google sheet.""" config_stream = GoogleSheetsBaseStream( tap=self, name="config", schema={"not": "null"}, path="https://sheets.googleapis.com/v4/spreadsheets/" - + self.config["sheet_id"] + + stream_config["sheet_id"] + "/values/" - + self.config.get("child_sheet_name", "") + + stream_config.get("child_sheet_name", "") + "!1:1", ) From 1b8e84d95db48e8b1d0d4bc8de030df5031ca2e7 Mon Sep 17 00:00:00 2001 From: Reuben Frankel Date: Thu, 18 Jan 2024 00:43:25 +0000 Subject: [PATCH 2/6] Fix `TypeError: get_sheet_name() missing 1 required positional argument: 'stream_config'` --- tap_google_sheets/tap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_google_sheets/tap.py b/tap_google_sheets/tap.py index fd97448..c1cace0 100644 --- a/tap_google_sheets/tap.py +++ b/tap_google_sheets/tap.py @@ -75,7 +75,7 @@ def discover_streams(self) -> List[Stream]: 
sheets = self.config.get("sheets") or [self.config] for stream_config in sheets: - stream_name = stream_config.get("output_name") or self.get_sheet_name() + stream_name = stream_config.get("output_name") or self.get_sheet_name(stream_config) stream_name = stream_name.replace(" ", "_") key_properties = stream_config.get("key_properties", []) From 6022636a9e1cf235c48f45c12feae986318767b1 Mon Sep 17 00:00:00 2001 From: Reuben Frankel Date: Thu, 18 Jan 2024 00:47:49 +0000 Subject: [PATCH 3/6] Fix lint issues --- tap_google_sheets/tap.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tap_google_sheets/tap.py b/tap_google_sheets/tap.py index c1cace0..356ae9d 100644 --- a/tap_google_sheets/tap.py +++ b/tap_google_sheets/tap.py @@ -27,8 +27,9 @@ class TapGoogleSheets(Tap): th.Property( "child_sheet_name", th.StringType, - description="Optionally sync data from a different sheet in" - + " your Google Sheet", + description=( + "Optionally sync data from a different sheet in your Google Sheet" + ), required=False, ), th.Property( @@ -39,7 +40,7 @@ class TapGoogleSheets(Tap): ), ) - config_jsonschema = th.PropertiesList( + base_config = th.PropertiesList( th.Property( "oauth_credentials.client_id", th.StringType, @@ -60,14 +61,14 @@ class TapGoogleSheets(Tap): required=False, description="The list of configs for each sheet/stream.", wrapped=th.ArrayType(per_sheet_config), - ) + ), ) for prop in per_sheet_config.wrapped.values(): # raise Exception(prop.name) - config_jsonschema.append(prop) + base_config.append(prop) - config_jsonschema = config_jsonschema.to_dict() + config_jsonschema = base_config.to_dict() def discover_streams(self) -> List[Stream]: """Return a list of discovered streams.""" @@ -75,7 +76,9 @@ def discover_streams(self) -> List[Stream]: sheets = self.config.get("sheets") or [self.config] for stream_config in sheets: - stream_name = stream_config.get("output_name") or self.get_sheet_name(stream_config) + stream_name = 
stream_config.get("output_name") or self.get_sheet_name( + stream_config + ) stream_name = stream_name.replace(" ", "_") key_properties = stream_config.get("key_properties", []) @@ -105,7 +108,8 @@ def get_sheet_name(self, stream_config): tap=self, name="config", schema={"one": "one"}, - path="https://www.googleapis.com/drive/v2/files/" + stream_config["sheet_id"], + path="https://www.googleapis.com/drive/v2/files/" + + stream_config["sheet_id"], ) prepared_request = config_stream.prepare_request(None, None) From de53f72963c8ba683f4c9c35d77a64e998b3c7d1 Mon Sep 17 00:00:00 2001 From: Reuben Frankel Date: Thu, 18 Jan 2024 11:20:44 +0000 Subject: [PATCH 4/6] Add test environment to `meltano.yml` --- meltano.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/meltano.yml b/meltano.yml index 7d9106b..2a4d5d5 100644 --- a/meltano.yml +++ b/meltano.yml @@ -1,6 +1,9 @@ version: 1 send_anonymous_usage_stats: true project_id: 04da77a3-af12-49a4-b9bf-3c22845918ba +default_environment: test +environments: +- name: test plugins: extractors: - name: tap-google-sheets From 63ad83e9852eb9c8c7c0e12b8cf5e7864294e388 Mon Sep 17 00:00:00 2001 From: Reuben Frankel Date: Thu, 18 Jan 2024 11:35:07 +0000 Subject: [PATCH 5/6] Specify `key_properties` setting in `meltano.yml` --- meltano.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/meltano.yml b/meltano.yml index 2a4d5d5..cbf391a 100644 --- a/meltano.yml +++ b/meltano.yml @@ -25,6 +25,8 @@ plugins: - name: sheet_id - name: output_name - name: child_sheet_name + - name: key_properties + kind: array loaders: - name: target-jsonl variant: andyh1203 From 622e57309d870328d61f25bd6dd7eb42b7c06e5f Mon Sep 17 00:00:00 2001 From: Reuben Frankel Date: Thu, 18 Jan 2024 12:35:33 +0000 Subject: [PATCH 6/6] Specify `array` kind for `sheets` setting in `meltano.yml` --- meltano.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/meltano.yml b/meltano.yml index c2f60d8..c58c518 100644 --- a/meltano.yml +++ b/meltano.yml @@ -28,6 +28,7 @@ 
plugins: - name: key_properties kind: array - name: sheets + kind: array loaders: - name: target-jsonl variant: andyh1203