diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml
index 970c48a..2f73e78 100644
--- a/.github/workflows/ci_workflow.yml
+++ b/.github/workflows/ci_workflow.yml
@@ -12,7 +12,7 @@ jobs:
strategy:
matrix:
# Only lint using the primary version used for dev
- python-version: [3.9]
+ python-version: ['3.9']
steps:
- uses: actions/checkout@v2
@@ -38,7 +38,7 @@ jobs:
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
strategy:
matrix:
- python-version: [3.7, 3.8, 3.9]
+ python-version: ['3.7', '3.8', '3.9']
steps:
- uses: actions/checkout@v2
diff --git a/README.md b/README.md
index 4cdd0a6..310c051 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,7 @@ Setting | Required | Type | Description |
`sheet_id` | Required | String | Your target google sheet id
`output_name` | Optional | String | Optionally rename the stream and output file or table from the tap
`child_sheet_name` | Optional | String | Optionally choose a different sheet from your Google Sheet file
+`range` | Optional | String | Optionally choose a range of data from your Google Sheet file (defaults to the entire sheet)
Range is defined using [A1 notation](https://developers.google.com/sheets/api/guides/concepts#expandable-1) and is start/end inclusive. Examples:
- `B5:G45` - start at `B5` and end at `G45`
- `A:T` - start at `A1` and end at the last cell of column `T` (same as `A1:T` and `A:T1`)
- `3:5` - start at `A3` and end at the last cell of row `5` (same as `A3:5` and `3:A5`)
- `D3:ZZZ` - start at `D3` and end at the last cell in the sheet
`key_properties` | Optional | Array of Strings | Optionally choose primary key column(s) from your Google Sheet file. Example: `["column_one", "column_two"]`
`sheets` | Optional | Array of Objects | Optionally provide a list of configs for each sheet/stream. See "Per Sheet Config" below. Overrides the `sheet_id` provided at the root level.
@@ -74,6 +75,7 @@ Setting | Required | Type | Description |
`sheet_id` | Required | String | Your target google sheet id
`output_name` | Optional | String | Optionally rename the stream and output file or table from the tap
`child_sheet_name` | Optional | String | Optionally choose a different sheet from your Google Sheet file
+`range` | Optional | String | Optionally choose a range of data from your Google Sheet file (defaults to the entire sheet)
Range is defined using [A1 notation](https://developers.google.com/sheets/api/guides/concepts#expandable-1) and is start/end inclusive. Examples:
- `B5:G45` - start at `B5` and end at `G45`
- `A:T` - start at `A1` and end at the last cell of column `T` (same as `A1:T` and `A:T1`)
- `3:5` - start at `A3` and end at the last cell of row `5` (same as `A3:5` and `3:A5`)
- `D3:ZZZ` - start at `D3` and end at the last cell in the sheet
`key_properties` | Optional | Array of Strings | Optionally choose primary key column(s) from your Google Sheet file. Example: `["column_one", "column_two"]`
### Environment Variable
@@ -85,6 +87,7 @@ These settings expand into environment variables of:
- `TAP_GOOGLE_SHEETS_SHEET_ID`
- `TAP_GOOGLE_SHEETS_OUTPUT_NAME`
- `TAP_GOOGLE_SHEETS_CHILD_SHEET_NAME`
+- `TAP_GOOGLE_SHEETS_RANGE`
- `TAP_GOOGLE_SHEETS_KEY_PROPERTIES`
- `TAP_GOOGLE_SHEETS_SHEETS`
@@ -122,7 +125,7 @@ These settings expand into environment variables of:
## Roadmap
-- [ ] Add setting to optionally allow the selection of a range of data from a sheet. (Add an optional range setting).
+- [x] Add setting to optionally allow the selection of a range of data from a sheet. (Add an optional range setting).
- [ ] Improve default behavior of a sheet with multiple columns of the same name and `target-postgres`.
diff --git a/meltano.yml b/meltano.yml
index c58c518..c20141b 100644
--- a/meltano.yml
+++ b/meltano.yml
@@ -25,6 +25,7 @@ plugins:
- name: sheet_id
- name: output_name
- name: child_sheet_name
+ - name: range
- name: key_properties
kind: array
- name: sheets
diff --git a/tap_google_sheets/streams.py b/tap_google_sheets/streams.py
index dfef997..d4323b7 100644
--- a/tap_google_sheets/streams.py
+++ b/tap_google_sheets/streams.py
@@ -24,7 +24,11 @@ class GoogleSheetsStream(GoogleSheetsBaseStream):
@property
def path(self):
"""Set the path for the stream."""
- return f"/{self.stream_config['sheet_id']}/values/{self.child_sheet_name}"
+ path = f"/{self.stream_config['sheet_id']}/values/{self.child_sheet_name}"
+ # A missing or empty "range" leaves the path without a range suffix;
+ # otherwise the configured range is appended after a "!" separator.
+ # NOTE(review): an empty-string range is silently skipped here, whereas
+ # TapGoogleSheets.get_first_line_range only treats None as unset and
+ # rejects an empty string - confirm the two are meant to differ.
+ sheet_range = self.stream_config.get("range")
+ if sheet_range:
+ path += f"!{sheet_range}"
+ return path
def parse_response(self, response: requests.Response) -> Iterable[dict]:
"""Parse response, build response back up into json, update stream schema."""
diff --git a/tap_google_sheets/tap.py b/tap_google_sheets/tap.py
index 356ae9d..e029561 100644
--- a/tap_google_sheets/tap.py
+++ b/tap_google_sheets/tap.py
@@ -6,6 +6,7 @@
import requests
from singer_sdk import Stream, Tap
from singer_sdk import typing as th
+from singer_sdk.exceptions import ConfigValidationError
from tap_google_sheets.client import GoogleSheetsBaseStream
from tap_google_sheets.streams import GoogleSheetsStream
@@ -14,6 +15,16 @@
class TapGoogleSheets(Tap):
"""google_sheets tap class."""
+ # Accepted A1-notation shapes for the "range" setting. Every pattern defines
+ # exactly four capture groups, in the order (start column, start row,
+ # end column, end row); an empty "()" stands in for a part that the shape
+ # omits, so get_first_line_range can always unpack four values.
+ # Columns are 1-3 letters and rows are 1-7 digits.
+ a1_allowed_regexp = [
+ r"^([A-Za-z]{1,3})(\d{1,7})()()$", # e.g. G8
+ r"^([A-Za-z]{1,3})():([A-Za-z]{1,3})()$", # e.g. C:G
+ r"^()(\d{1,7}):()(\d{1,7})$", # e.g. 1:5
+ r"^([A-Za-z]{1,3})(\d{1,7}):()(\d{1,7})$", # e.g. C1:5
+ r"^([A-Za-z]{1,3})(\d{1,7}):([A-Za-z]{1,3})()$", # e.g. A1:B
+ r"^([A-Za-z]{1,3})(\d{1,7}):([A-Za-z]{1,3})(\d{1,7})$", # e.g. C4:G14
+ r"^([A-Za-z]{1,3})():([A-Za-z]{1,3})(\d{1,7})$", # e.g. A:B5
+ r"^()(\d{1,7}):([A-Za-z]{1,3})(\d{1,7})$", # e.g. 2:B5
+ ]
name = "tap-google-sheets"
per_sheet_config = th.ObjectType(
@@ -38,6 +49,16 @@ class TapGoogleSheets(Tap):
description="Optionally choose one or more primary key columns",
required=False,
),
+ th.Property(
+ "range",
+ th.StringType(),
+ description=(
+ "Optionally choose a range of data using cell start and end coordinates"
+ " - see [A1 notation](https://developers.google.com/sheets/api/guides/concepts#expandable-1)" # noqa: E501
+ " for more information"
+ ),
+ required=False,
+ ),
)
base_config = th.PropertiesList(
@@ -137,6 +158,37 @@ def get_first_visible_child_sheet_name(self, google_sheet_data: requests.Respons
return sheet_in_sheet_name
+ @classmethod
+ def get_first_line_range(cls, stream_config):
+ """Get the range of the first line in the Google sheet.
+
+ Args:
+ stream_config: Stream config; only its optional "range" entry
+ (read via ``.get``) is used.
+
+ Returns:
+ "1:1" when no range is configured. Otherwise the configured range
+ collapsed to a single row: the smaller of the start/end row numbers
+ (or the only one given, or 1 if none), between the start and end
+ columns, e.g. "A6:K38" -> "A6:K6".
+
+ Raises:
+ ConfigValidationError: If the range matches none of the patterns
+ in ``a1_allowed_regexp``. An empty string is not treated as
+ "unset" and therefore raises as well.
+ """
+ sheet_range = stream_config.get("range")
+
+ # when the range is not specified, it will default to the first line and
+ # short-circuit further evaluation
+ if sheet_range is None:
+ return "1:1"
+
+ # Lazy generator: patterns are tried in order and matching stops at the
+ # first one that succeeds.
+ range_matcher = (re.match(p, sheet_range) for p in cls.a1_allowed_regexp)
+
+ try:
+ match = next(match for match in range_matcher if match)
+ except StopIteration as e:
+ raise ConfigValidationError("Invalid A1 notation for range") from e
+
+ # Every allowed pattern has four groups, so this always unpacks four
+ # strings; a part that is absent from the range comes back as "".
+ start_column, start_line, end_column, end_line = match.groups("")
+
+ # line_number is an int in the first branch and a str in the second; both
+ # render the same way in the f-string below.
+ # NOTE(review): the patterns' \d{1,7} also accepts row 0 (e.g. "A0" would
+ # produce "A0:A0") - confirm whether that should be rejected here.
+ if start_line and end_line:
+ line_number = min(int(start_line), int(end_line))
+ else:
+ line_number = start_line or end_line or "1"
+
+ # If both end_line and end_column are not specified, use start_column
+ # it can happen just when the range is single cell e.g "A5" -> "A5:A5"
+ if not end_column and not end_line:
+ end_column = start_column
+
+ return f"{start_column}{line_number}:{end_column}{line_number}"
+
def get_sheet_data(self, stream_config):
"""Get the data from the selected or first visible sheet in the google sheet."""
config_stream = GoogleSheetsBaseStream(
@@ -147,7 +199,8 @@ def get_sheet_data(self, stream_config):
+ stream_config["sheet_id"]
+ "/values/"
+ stream_config.get("child_sheet_name", "")
- + "!1:1",
+ + "!"
+ + self.get_first_line_range(stream_config),
)
prepared_request = config_stream.prepare_request(None, None)
diff --git a/tap_google_sheets/tests/test_first_line_range.py b/tap_google_sheets/tests/test_first_line_range.py
new file mode 100644
index 0000000..8966001
--- /dev/null
+++ b/tap_google_sheets/tests/test_first_line_range.py
@@ -0,0 +1,42 @@
+import unittest
+
+from singer_sdk.exceptions import ConfigValidationError
+
+from tap_google_sheets.tap import TapGoogleSheets
+
+
+class TestFirstLineRange(unittest.TestCase):
+ """Tests for TapGoogleSheets.get_first_line_range."""
+
+ def test_first_line_range_valid(self):
+ """Check that valid A1 ranges collapse to the expected single-row range."""
+ # Each pair is (configured range, expected first-line range).
+ test_pairs = [
+ ("D5", "D5:D5"),
+ ("1:1", "1:1"),
+ ("5:8", "5:5"),
+ ("A1:G", "A1:G1"),
+ ("A5:G", "A5:G5"),
+ ("A5:7", "A5:5"),
+ ("G8:3", "G3:3"),
+ ("C:G", "C1:G1"),
+ ("2:B5", "2:B2"),
+ ("A:B5", "A5:B5"),
+ ("A6:GE56", "A6:GE6"),
+ ("A6:K38", "A6:K6"),
+ ]
+ for test_input, expected in test_pairs:
+ stream_config = {"range": test_input}
+ self.assertEqual(
+ expected, TapGoogleSheets.get_first_line_range(stream_config)
+ )
+
+ def test_invalid_range(self):
+ """Check that malformed ranges, including "", raise ConfigValidationError."""
+ test_values = ["", "invalid", "A:G:5", "A:", ":G", "5:", ":3", "A:5"]
+ for test_input in test_values:
+ stream_config = {"range": test_input}
+ with self.assertRaises(ConfigValidationError):
+ TapGoogleSheets.get_first_line_range(stream_config)
+
+ def test_empty_range(self):
+ """Check that a config without a `range` key falls back to `1:1`."""
+ stream_config = {}
+ self.assertEqual("1:1", TapGoogleSheets.get_first_line_range(stream_config))