diff --git a/README.md b/README.md index 9e6ab68..68c6ac7 100644 --- a/README.md +++ b/README.md @@ -237,51 +237,51 @@ develop your own Singer taps and targets. The below table shows how this tap will map between jsonschema datatypes and Postgres datatypes. -| jsonschema | Postgres | -|--------------------------------|-----------------------------------------| -| integer | bigint | -| UNSUPPORTED | bigserial | -| UNSUPPORTED | bit [ (n) ] | -| UNSUPPORTED | bit varying [ (n) ] | -| boolean | boolean | -| UNSUPPORTED | box | +| jsonschema | Postgres | +| ---------------------------------------------------------------------------------- | --------------------------------------- | +| integer | bigint | +| integer with minimum >= -32768 and maximum < 32768 | smallint | +| integer with minimum >= -2147483648 and maximum < 2147483648 | integer | +| UNSUPPORTED | bigserial | +| UNSUPPORTED | bit [ (n) ] | +| UNSUPPORTED | bit varying [ (n) ] | +| boolean | boolean | +| UNSUPPORTED | box | | string with contentEncoding="base16" ([opt-in feature](#content-encoding-support)) | bytea | -| UNSUPPORTED | character [ (n) ] | -| UNSUPPORTED | character varying [ (n) ] | -| UNSUPPORTED | cidr | -| UNSUPPORTED | circle | -| string with format="date" | date | -| UNSUPPORTED | double precision | -| UNSUPPORTED | inet | -| UNSUPPORTED | integer | -| UNSUPPORTED | interval [ fields ] [ (p) ] | -| UNSUPPORTED | json | -| array; object | jsonb | -| UNSUPPORTED | line | -| UNSUPPORTED | lseg | -| UNSUPPORTED | macaddr | -| UNSUPPORTED | macaddr8 | -| UNSUPPORTED | money | -| number | numeric [ (p, s) ] | -| UNSUPPORTED | path | -| UNSUPPORTED | pg_lsn | -| UNSUPPORTED | pg_snapshot | -| UNSUPPORTED | point | -| UNSUPPORTED | polygon | -| UNSUPPORTED | real | -| UNSUPPORTED | smallint | -| UNSUPPORTED | smallserial | -| UNSUPPORTED | serial | -| string without format; untyped | text | -| string with format="time" | time [ (p) ] [ without time zone ] | -| UNSUPPORTED | time [ (p) ] with time 
zone | -| string with format="date-time" | timestamp [ (p) ] [ without time zone ] | -| UNSUPPORTED | timestamp [ (p) ] with time zone | -| UNSUPPORTED | tsquery | -| UNSUPPORTED | tsvector | -| UNSUPPORTED | txid_snapshot | -| string with format="uuid" | uuid | -| UNSUPPORTED | xml | +| UNSUPPORTED | character [ (n) ] | +| UNSUPPORTED | character varying [ (n) ] | +| UNSUPPORTED | cidr | +| UNSUPPORTED | circle | +| string with format="date" | date | +| UNSUPPORTED | double precision | +| UNSUPPORTED | inet | +| UNSUPPORTED | interval [ fields ] [ (p) ] | +| UNSUPPORTED | json | +| array; object | jsonb | +| UNSUPPORTED | line | +| UNSUPPORTED | lseg | +| UNSUPPORTED | macaddr | +| UNSUPPORTED | macaddr8 | +| UNSUPPORTED | money | +| number | numeric [ (p, s) ] | +| UNSUPPORTED | path | +| UNSUPPORTED | pg_lsn | +| UNSUPPORTED | pg_snapshot | +| UNSUPPORTED | point | +| UNSUPPORTED | polygon | +| UNSUPPORTED | real | +| UNSUPPORTED | smallserial | +| UNSUPPORTED | serial | +| string without format; untyped | text | +| string with format="time" | time [ (p) ] [ without time zone ] | +| UNSUPPORTED | time [ (p) ] with time zone | +| string with format="date-time" | timestamp [ (p) ] [ without time zone ] | +| UNSUPPORTED | timestamp [ (p) ] with time zone | +| UNSUPPORTED | tsquery | +| UNSUPPORTED | tsvector | +| UNSUPPORTED | txid_snapshot | +| string with format="uuid" | uuid | +| UNSUPPORTED | xml | Note that while object types are mapped directly to jsonb, array types are mapped to a jsonb array. @@ -297,9 +297,28 @@ If a column has multiple jsonschema types, the following order is using to order - DECIMAL - BIGINT - INTEGER +- SMALLINT - BOOLEAN - NOTYPE +### Using the Singer catalog to narrow down the Postgres data types + +You can use [Singer catalog's schema](https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#schemas) to override the data types coming from the tap. 
The easiest way to do this is to use Meltano and its [`schema` setting](https://docs.meltano.com/concepts/plugins/#schema-extra) for the tap: + +```yaml +# meltano.yml +plugins: + extractors: + - name: tap-my-tap + schema: + some_stream_id: + my_column: + type: integer + # This will be mapped to 'smallint' + minimum: 0 + maximum: 1000 +``` + ## Content Encoding Support Json Schema supports the [`contentEncoding` keyword](https://datatracker.ietf.org/doc/html/rfc4648#section-8), which can be used to specify the encoding of input string types. diff --git a/poetry.lock b/poetry.lock index a616ff9..ce6ec06 100644 --- a/poetry.lock +++ b/poetry.lock @@ -735,6 +735,8 @@ optional = false python-versions = "*" files = [ {file = "jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c"}, + {file = "jsonpath_ng-1.7.0-py2-none-any.whl", hash = "sha256:898c93fc173f0c336784a3fa63d7434297544b7198124a68f9a3ef9597b0ae6e"}, + {file = "jsonpath_ng-1.7.0-py3-none-any.whl", hash = "sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6"}, ] [package.dependencies] @@ -1402,29 +1404,29 @@ files = [ [[package]] name = "ruff" -version = "0.7.4" +version = "0.8.0" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.7.4-py3-none-linux_armv6l.whl", hash = "sha256:a4919925e7684a3f18e18243cd6bea7cfb8e968a6eaa8437971f681b7ec51478"}, - {file = "ruff-0.7.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:cfb365c135b830778dda8c04fb7d4280ed0b984e1aec27f574445231e20d6c63"}, - {file = "ruff-0.7.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:63a569b36bc66fbadec5beaa539dd81e0527cb258b94e29e0531ce41bacc1f20"}, - {file = "ruff-0.7.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d06218747d361d06fd2fdac734e7fa92df36df93035db3dc2ad7aa9852cb109"}, - {file = "ruff-0.7.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e0cea28d0944f74ebc33e9f934238f15c758841f9f5edd180b5315c203293452"}, - {file = "ruff-0.7.4-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:80094ecd4793c68b2571b128f91754d60f692d64bc0d7272ec9197fdd09bf9ea"}, - {file = "ruff-0.7.4-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:997512325c6620d1c4c2b15db49ef59543ef9cd0f4aa8065ec2ae5103cedc7e7"}, - {file = "ruff-0.7.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:00b4cf3a6b5fad6d1a66e7574d78956bbd09abfd6c8a997798f01f5da3d46a05"}, - {file = "ruff-0.7.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7dbdc7d8274e1422722933d1edddfdc65b4336abf0b16dfcb9dedd6e6a517d06"}, - {file = "ruff-0.7.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e92dfb5f00eaedb1501b2f906ccabfd67b2355bdf117fea9719fc99ac2145bc"}, - {file = "ruff-0.7.4-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:3bd726099f277d735dc38900b6a8d6cf070f80828877941983a57bca1cd92172"}, - {file = "ruff-0.7.4-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:2e32829c429dd081ee5ba39aef436603e5b22335c3d3fff013cd585806a6486a"}, - {file = "ruff-0.7.4-py3-none-musllinux_1_2_i686.whl", hash = 
"sha256:662a63b4971807623f6f90c1fb664613f67cc182dc4d991471c23c541fee62dd"}, - {file = "ruff-0.7.4-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:876f5e09eaae3eb76814c1d3b68879891d6fde4824c015d48e7a7da4cf066a3a"}, - {file = "ruff-0.7.4-py3-none-win32.whl", hash = "sha256:75c53f54904be42dd52a548728a5b572344b50d9b2873d13a3f8c5e3b91f5cac"}, - {file = "ruff-0.7.4-py3-none-win_amd64.whl", hash = "sha256:745775c7b39f914238ed1f1b0bebed0b9155a17cd8bc0b08d3c87e4703b990d6"}, - {file = "ruff-0.7.4-py3-none-win_arm64.whl", hash = "sha256:11bff065102c3ae9d3ea4dc9ecdfe5a5171349cdd0787c1fc64761212fc9cf1f"}, - {file = "ruff-0.7.4.tar.gz", hash = "sha256:cd12e35031f5af6b9b93715d8c4f40360070b2041f81273d0527683d5708fce2"}, + {file = "ruff-0.8.0-py3-none-linux_armv6l.whl", hash = "sha256:fcb1bf2cc6706adae9d79c8d86478677e3bbd4ced796ccad106fd4776d395fea"}, + {file = "ruff-0.8.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:295bb4c02d58ff2ef4378a1870c20af30723013f441c9d1637a008baaf928c8b"}, + {file = "ruff-0.8.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:7b1f1c76b47c18fa92ee78b60d2d20d7e866c55ee603e7d19c1e991fad933a9a"}, + {file = "ruff-0.8.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eb0d4f250a7711b67ad513fde67e8870109e5ce590a801c3722580fe98c33a99"}, + {file = "ruff-0.8.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0e55cce9aa93c5d0d4e3937e47b169035c7e91c8655b0974e61bb79cf398d49c"}, + {file = "ruff-0.8.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3f4cd64916d8e732ce6b87f3f5296a8942d285bbbc161acee7fe561134af64f9"}, + {file = "ruff-0.8.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c5c1466be2a2ebdf7c5450dd5d980cc87c8ba6976fb82582fea18823da6fa362"}, + {file = "ruff-0.8.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2dabfd05b96b7b8f2da00d53c514eea842bff83e41e1cceb08ae1966254a51df"}, + {file = 
"ruff-0.8.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:facebdfe5a5af6b1588a1d26d170635ead6892d0e314477e80256ef4a8470cf3"}, + {file = "ruff-0.8.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87a8e86bae0dbd749c815211ca11e3a7bd559b9710746c559ed63106d382bd9c"}, + {file = "ruff-0.8.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:85e654f0ded7befe2d61eeaf3d3b1e4ef3894469cd664ffa85006c7720f1e4a2"}, + {file = "ruff-0.8.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:83a55679c4cb449fa527b8497cadf54f076603cc36779b2170b24f704171ce70"}, + {file = "ruff-0.8.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:812e2052121634cf13cd6fddf0c1871d0ead1aad40a1a258753c04c18bb71bbd"}, + {file = "ruff-0.8.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:780d5d8523c04202184405e60c98d7595bdb498c3c6abba3b6d4cdf2ca2af426"}, + {file = "ruff-0.8.0-py3-none-win32.whl", hash = "sha256:5fdb6efecc3eb60bba5819679466471fd7d13c53487df7248d6e27146e985468"}, + {file = "ruff-0.8.0-py3-none-win_amd64.whl", hash = "sha256:582891c57b96228d146725975fbb942e1f30a0c4ba19722e692ca3eb25cc9b4f"}, + {file = "ruff-0.8.0-py3-none-win_arm64.whl", hash = "sha256:ba93e6294e9a737cd726b74b09a6972e36bb511f9a102f1d9a7e1ce94dd206a6"}, + {file = "ruff-0.8.0.tar.gz", hash = "sha256:a7ccfe6331bf8c8dad715753e157457faf7351c2b69f62f32c165c2dbcbacd44"}, ] [[package]] @@ -1869,13 +1871,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "virtualenv" -version = "20.27.1" +version = "20.28.0" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.8" files = [ - {file = "virtualenv-20.27.1-py3-none-any.whl", hash = "sha256:f11f1b8a29525562925f745563bfd48b189450f61fb34c4f9cc79dd5aa32a1f4"}, - {file = "virtualenv-20.27.1.tar.gz", hash = "sha256:142c6be10212543b32c6c45d3d3893dff89112cc588b7d0879ae5a1ec03a47ba"}, + {file = "virtualenv-20.28.0-py3-none-any.whl", hash = 
"sha256:23eae1b4516ecd610481eda647f3a7c09aea295055337331bb4e6892ecce47b0"}, + {file = "virtualenv-20.28.0.tar.gz", hash = "sha256:2c9c3262bb8e7b87ea801d715fae4495e6032450c71d2309be9550e7364049aa"}, ] [package.dependencies] diff --git a/target_postgres/connector.py b/target_postgres/connector.py index f627876..1131f0c 100644 --- a/target_postgres/connector.py +++ b/target_postgres/connector.py @@ -5,6 +5,7 @@ import atexit import io import itertools +import math import signal import sys import typing as t @@ -18,7 +19,14 @@ import sqlalchemy as sa from singer_sdk import SQLConnector from singer_sdk.connectors.sql import JSONSchemaToSQL -from sqlalchemy.dialects.postgresql import ARRAY, BIGINT, BYTEA, JSONB, UUID +from sqlalchemy.dialects.postgresql import ( + ARRAY, + BIGINT, + BYTEA, + JSONB, + SMALLINT, + UUID, +) from sqlalchemy.engine import URL from sqlalchemy.engine.url import make_url from sqlalchemy.types import ( @@ -255,12 +263,23 @@ def _handle_array_type(self, jsonschema: dict) -> ARRAY | JSONB: # Case 3: tuples return ARRAY(JSONB()) if isinstance(items, list) else JSONB() + def _handle_integer_type(self, jsonschema: dict) -> SMALLINT | INTEGER | BIGINT: + """Handle integer type.""" + minimum = jsonschema.get("minimum", -math.inf) + maximum = jsonschema.get("maximum", math.inf) + if minimum >= -(2**15) and maximum < 2**15: + return SMALLINT() + if minimum >= -(2**31) and maximum < 2**31: + return INTEGER() + + return BIGINT() + @cached_property def jsonschema_to_sql(self) -> JSONSchemaToSQL: """Return a JSONSchemaToSQL instance with custom type handling.""" to_sql = JSONSchemaToPostgres(content_encoding=self.interpret_content_encoding) to_sql.fallback_type = TEXT - to_sql.register_type_handler("integer", BIGINT) + to_sql.register_type_handler("integer", self._handle_integer_type) to_sql.register_type_handler("object", JSONB) to_sql.register_type_handler("array", self._handle_array_type) to_sql.register_format_handler("date-time", TIMESTAMP) @@ -365,6 
+384,7 @@ def pick_best_sql_type(sql_type_array: list): DECIMAL, BIGINT, INTEGER, + SMALLINT, BOOLEAN, NOTYPE, ] diff --git a/target_postgres/tests/test_types.py b/target_postgres/tests/test_types.py index 8741d5f..065ca32 100644 --- a/target_postgres/tests/test_types.py +++ b/target_postgres/tests/test_types.py @@ -2,8 +2,9 @@ import pytest import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import BIGINT, SMALLINT -from target_postgres.connector import NOTYPE, PostgresConnector +from target_postgres.connector import NOTYPE, JSONSchemaToPostgres, PostgresConnector @pytest.fixture @@ -36,3 +37,70 @@ def connector(): def test_type_hierarchy(connector, types, expected): """Test that types are merged correctly.""" assert type(connector.merge_sql_types(types)) is expected + + +class TestJSONSchemaToPostgres: + """Test JSONSchemaToPostgres class.""" + + @pytest.fixture + def to_postgres(self, connector: PostgresConnector): + """Create a JSONSchemaToPostgres instance.""" + return connector.jsonschema_to_sql + + def test_datetime_string(self, to_postgres: JSONSchemaToPostgres): + """Test conversion of JSON schema string to Postgres datetime.""" + result = to_postgres.to_sql_type({"type": "string", "format": "date-time"}) + assert type(result) is sa.TIMESTAMP + + @pytest.mark.parametrize( + ("jsonschema", "expected"), + [ + pytest.param({"type": "integer"}, BIGINT, id="default"), + pytest.param({"type": ["integer", "null"]}, BIGINT, id="default-nullable"), + pytest.param( + { + "type": "integer", + "minimum": 0, + "maximum": 2**15 - 1, + }, + SMALLINT, + id="smallint", + ), + pytest.param( + { + "type": "integer", + "minimum": -5, + "maximum": 5, + }, + SMALLINT, + id="negative-smallint", + ), + pytest.param( + { + "type": "integer", + "minimum": 0, + "maximum": 2**31 - 1, + }, + sa.INTEGER, + id="integer", + ), + pytest.param( + { + "type": "integer", + "minimum": 0, + "maximum": 2**31 + 1, + }, + BIGINT, + id="bigint", + ), + ], + ) + def test_integers( + self, 
+ to_postgres: JSONSchemaToPostgres, + jsonschema: dict, + expected: type[sa.types.TypeEngine], + ): + """Test conversion of JSON schema types to Postgres types.""" + result = to_postgres.to_sql_type(jsonschema) + assert type(result) is expected