diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml index 970c48a..2f73e78 100644 --- a/.github/workflows/ci_workflow.yml +++ b/.github/workflows/ci_workflow.yml @@ -12,7 +12,7 @@ jobs: strategy: matrix: # Only lint using the primary version used for dev - python-version: [3.9] + python-version: ['3.9'] steps: - uses: actions/checkout@v2 @@ -38,7 +38,7 @@ jobs: GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} strategy: matrix: - python-version: [3.7, 3.8, 3.9] + python-version: ['3.7', '3.8', '3.9'] steps: - uses: actions/checkout@v2 diff --git a/README.md b/README.md index 4cdd0a6..310c051 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,7 @@ Setting | Required | Type | Description | `sheet_id` | Required | String | Your target google sheet id `output_name` | Optional | String | Optionailly rename the stream and output file or table from the tap `child_sheet_name` | Optional | String | Optionally choose a different sheet from your Google Sheet file +`range` | Optional | String | Optionally choose a range of data from your Google Sheet file (defaults to the entire sheet)

Range is defined using [A1 notation](https://developers.google.com/sheets/api/guides/concepts#expandable-1) and is start/end inclusive. Examples: `key_properties` | Optional | Array of Strings | Optionally choose primary key column(s) from your Google Sheet file. Example: `["column_one", "column_two"]` `sheets` | Optional | Array of Objects | Optionally provide a list of configs for each sheet/stream. See "Per Sheet Config" below. Overrides the `sheet_id` provided at the root level. @@ -74,6 +75,7 @@ Setting | Required | Type | Description | `sheet_id` | Required | String | Your target google sheet id `output_name` | Optional | String | Optionailly rename the stream and output file or table from the tap `child_sheet_name` | Optional | String | Optionally choose a different sheet from your Google Sheet file +`range` | Optional | String | Optionally choose a range of data from your Google Sheet file (defaults to the entire sheet)

Range is defined using [A1 notation](https://developers.google.com/sheets/api/guides/concepts#expandable-1) and is start/end inclusive. Examples: `key_properties` | Optional | Array of Strings | Optionally choose primary key column(s) from your Google Sheet file. Example: `["column_one", "column_two"]` ### Environment Variable @@ -85,6 +87,7 @@ These settings expand into environment variables of: - `TAP_GOOGLE_SHEETS_SHEET_ID` - `TAP_GOOGLE_SHEETS_OUTPUT_NAME` - `TAP_GOOGLE_SHEETS_CHILD_SHEET_NAME` +- `TAP_GOOGLE_SHEETS_RANGE` - `TAP_GOOGLE_SHEETS_KEY_PROPERTIES` - `TAP_GOOGLE_SHEETS_SHEETS` @@ -122,7 +125,7 @@ These settings expand into environment variables of: ## Roadmap -- [ ] Add setting to optionally allow the selection of a range of data from a sheet. (Add an optional range setting). +- [x] Add setting to optionally allow the selection of a range of data from a sheet. (Add an optional range setting). - [ ] Improve default behavior of a sheet with multiple columns of the same name and `target-postgres`. 
@property
def path(self):
    """Return the API path of the sheet, suffixed with the configured range if any."""
    base = f"/{self.stream_config['sheet_id']}/values/{self.child_sheet_name}"
    selected = self.stream_config.get("range")
    if not selected:
        # No (or empty) range configured: address the whole child sheet.
        return base
    # The sheet name and the range are joined with "!".
    return base + f"!{selected}"
@classmethod
def get_first_line_range(cls, stream_config):
    """Get the range of the first line in the Google sheet.

    Args:
        stream_config: Per-sheet config mapping; only its optional "range"
            key (a string in A1 notation) is read.

    Returns:
        An A1 range covering only the first row of the configured range, or
        "1:1" when no range is configured.

    Raises:
        ConfigValidationError: If the range matches none of the patterns in
            ``a1_allowed_regexp``.
    """
    sheet_range = stream_config.get("range")

    # Without a configured range the first line is simply row 1 of the sheet.
    if sheet_range is None:
        return "1:1"

    # fullmatch rather than match: the patterns end in "$", which also matches
    # just before a trailing newline, so match() would accept "A1\n" and the
    # newline would end up in the request URL.
    candidates = (re.fullmatch(p, sheet_range) for p in cls.a1_allowed_regexp)
    match = next((m for m in candidates if m), None)
    if match is None:
        raise ConfigValidationError("Invalid A1 notation for range")

    # Groups that a pattern leaves empty come back as "".
    start_column, start_line, end_column, end_line = match.groups("")

    if start_line and end_line:
        # Bounds may be written in either order (e.g. "G8:3"); the first row
        # of the range is the smaller of the two.
        line_number = min(int(start_line), int(end_line))
    else:
        # Only one (or no) row bound given; int() keeps the result normalised
        # the same way as the branch above (e.g. "05" -> 5).
        line_number = int(start_line or end_line or 1)

    # If both end_line and end_column are not specified, use start_column.
    # This only happens for a single cell, e.g. "A5" -> "A5:A5".
    if not end_column and not end_line:
        end_column = start_column

    return f"{start_column}{line_number}:{end_column}{line_number}"
stream_config.get("child_sheet_name", "") - + "!1:1", + + "!" + + self.get_first_line_range(stream_config), ) prepared_request = config_stream.prepare_request(None, None) diff --git a/tap_google_sheets/tests/test_first_line_range.py b/tap_google_sheets/tests/test_first_line_range.py new file mode 100644 index 0000000..8966001 --- /dev/null +++ b/tap_google_sheets/tests/test_first_line_range.py @@ -0,0 +1,42 @@ +import unittest + +from singer_sdk.exceptions import ConfigValidationError + +from tap_google_sheets.tap import TapGoogleSheets + + +class TestFirstLineRange(unittest.TestCase): + def test_first_line_range_valid(self): + """Test first line range.""" + test_pairs = [ + ("D5", "D5:D5"), + ("1:1", "1:1"), + ("5:8", "5:5"), + ("A1:G", "A1:G1"), + ("A5:G", "A5:G5"), + ("A5:7", "A5:5"), + ("G8:3", "G3:3"), + ("C:G", "C1:G1"), + ("2:B5", "2:B2"), + ("A:B5", "A5:B5"), + ("A6:GE56", "A6:GE6"), + ("A6:K38", "A6:K6"), + ] + for test_input, expected in test_pairs: + stream_config = {"range": test_input} + self.assertEqual( + expected, TapGoogleSheets.get_first_line_range(stream_config) + ) + + def test_invalid_range(self): + """Test invalid range.""" + test_values = ["", "invalid", "A:G:5", "A:", ":G", "5:", ":3", "A:5"] + for test_input in test_values: + stream_config = {"range": test_input} + with self.assertRaises(ConfigValidationError): + TapGoogleSheets.get_first_line_range(stream_config) + + def test_empty_range(self): + """Test empty range.""" + stream_config = {} + self.assertEqual("1:1", TapGoogleSheets.get_first_line_range(stream_config))