From 0a200a35ea4a21ca784c4b02e8503a90640dfa88 Mon Sep 17 00:00:00 2001
From: Samira-El <54845154+Samira-El@users.noreply.github.com>
Date: Tue, 7 Jan 2020 12:20:18 +0200
Subject: [PATCH] 1.1.8: create binary column for the binary data (#53)

* 1.1.8: create binary column for the binary data

* bump minor

* update test to get rid of repeated input
---
 setup.py                                      |   2 +-
 target_snowflake/db_sync.py                   |   4 +
 .../messages-with-binary-columns.json         |  16 +++
 tests/integration/test_target_snowflake.py    |  32 +++++
 tests/unit/test_db_sync.py                    | 126 +++++++++++-------
 5 files changed, 133 insertions(+), 47 deletions(-)
 create mode 100644 tests/integration/resources/messages-with-binary-columns.json

diff --git a/setup.py b/setup.py
index 42b39460..82e81429 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
       long_description = f.read()
 
 setup(name="pipelinewise-target-snowflake",
-      version="1.1.7",
+      version="1.2.0",
       description="Singer.io target for loading data to Snowflake - PipelineWise compatible",
       long_description=long_description,
       long_description_content_type='text/markdown',
diff --git a/target_snowflake/db_sync.py b/target_snowflake/db_sync.py
index b0fa4217..143d0ae1 100644
--- a/target_snowflake/db_sync.py
+++ b/target_snowflake/db_sync.py
@@ -65,6 +65,8 @@ def column_type(schema_property):
         column_type = 'timestamp_ntz'
     elif property_format == 'time':
         column_type = 'time'
+    elif property_format == 'binary':
+        column_type = 'binary'
     elif 'number' in property_type:
         column_type = 'float'
     elif 'integer' in property_type and 'string' in property_type:
@@ -82,6 +84,8 @@ def column_trans(schema_property):
     column_trans = ''
     if 'object' in property_type or 'array' in property_type:
         column_trans = 'parse_json'
+    elif schema_property.get('format') == 'binary':
+        column_trans = 'to_binary'
 
     return column_trans
 
diff --git a/tests/integration/resources/messages-with-binary-columns.json b/tests/integration/resources/messages-with-binary-columns.json
new file mode 100644
index 00000000..2a22f313
--- /dev/null
+++ b/tests/integration/resources/messages-with-binary-columns.json
@@ -0,0 +1,16 @@
+{"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_binary"}}
+{"type": "SCHEMA", "stream": "tap_mysql_test-test_binary", "schema": {"properties": {"data": {"inclusion": "available", "format": "binary", "type": ["null", "string"]}, "id": {"inclusion": "automatic", "format": "binary", "type": ["null", "string"]}, "created_at": {"inclusion": "available", "format": "date-time", "type": ["null", "string"]}}, "type": "object"}, "key_properties": ["id"]}
+{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_binary", "version": 1576670613163}
+{"type": "RECORD", "stream": "tap_mysql_test-test_binary", "record": {"data": "6461746132", "id": "706b32", "created_at": "2019-12-17T16:02:55+00:00"}, "version": 1576670613163, "time_extracted": "2019-12-18T12:03:33.174343Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_binary", "record": {"data": "64617461313030", "id": "706b33", "created_at": "2019-12-18T11:46:38+00:00"}, "version": 1576670613163, "time_extracted": "2019-12-18T12:03:33.174343Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_binary", "record": {"data": "6461746134", "id": "706b34", "created_at": "2019-12-17T16:32:22+00:00"}, "version": 1576670613163, "time_extracted": "2019-12-18T12:03:33.174343Z"}
+{"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_binary", "bookmarks": {"tap_mysql_test-test_binary": {"version": 1576670613163}}}}
+{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_binary", "version": 1576670613163}
+{"type": "STATE", "value": {"currently_syncing": null, "bookmarks": {"tap_mysql_test-test_binary": {"version": 1576670613163, "log_file": "mysql-bin.000004", "log_pos": 945}}}}
+{"type": "STATE", "value": {"currently_syncing": null, "bookmarks": {"tap_mysql_test-test_binary": {"version": 1576670613163, "log_file": "mysql-bin.000004", "log_pos": 945}}}}
+{"type": "SCHEMA", "stream": "tap_mysql_test-test_binary", "schema": {"properties": {"data": {"inclusion": "available", "format": "binary", "type": ["null", "string"]}, "created_at": {"inclusion": "available", "format": "date-time", "type": ["null", "string"]}, "id": {"inclusion": "automatic", "format": "binary", "type": ["null", "string"]}}, "type": "object"}, "key_properties": ["id"]}
+{"type": "RECORD", "stream": "tap_mysql_test-test_binary", "record": {"id": "706b35", "data": "6461746135", "created_at": "2019-12-18T13:19:20+00:00"}, "version": 1576670613163, "time_extracted": "2019-12-18T13:24:31.441849Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_binary", "record": {"id": "706b35", "data": "64617461313030", "created_at": "2019-12-18T13:19:35+00:00"}, "version": 1576670613163, "time_extracted": "2019-12-18T13:24:31.441849Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_binary", "record": {"id": "706b33", "data": "64617461313030", "created_at": "2019-12-18T11:46:38+00:00", "_sdc_deleted_at": "2019-12-18T13:19:44+00:00+00:00"}, "version": 1576670613163, "time_extracted": "2019-12-18T13:24:31.441849Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_binary", "record": {"id": "706b35", "data": "64617461313030", "created_at": "2019-12-18T13:19:35+00:00", "_sdc_deleted_at": "2019-12-18T13:19:44+00:00+00:00"}, "version": 1576670613163, "time_extracted": "2019-12-18T13:24:31.441849Z"}
+{"type": "STATE", "value": {"currently_syncing": null, "bookmarks": {"tap_mysql_test-test_binary": {"version": 1576670613163, "log_file": "mysql-bin.000004", "log_pos": 1867}}}}
diff --git a/tests/integration/test_target_snowflake.py b/tests/integration/test_target_snowflake.py
index 57786964..4fc0c2bf 100644
--- a/tests/integration/test_target_snowflake.py
+++ b/tests/integration/test_target_snowflake.py
@@ -246,6 +246,25 @@ def assert_logical_streams_are_in_snowflake_and_are_empty(self):
         self.assertEqual(table_three, [])
         self.assertEqual(table_four, [])
 
+    def assert_binary_data_are_in_snowflake(self, should_metadata_columns_exist=False):
+        # Get loaded rows from tables
+        snowflake = DbSync(self.config)
+        target_schema = self.config.get('default_target_schema', '')
+        table_one = snowflake.query("SELECT * FROM {}.test_binary ORDER BY ID".format(target_schema))
+
+        # ----------------------------------------------------------------------
+        # Check rows in table_one
+        # ----------------------------------------------------------------------
+        expected_table_one = [
+            {'ID': b'pk2', 'DATA': b'data2', 'CREATED_AT': datetime.datetime(2019, 12, 17, 16, 2, 55)},
+            {'ID': b'pk4', 'DATA': b'data4', "CREATED_AT": datetime.datetime(2019, 12, 17, 16, 32, 22)},
+        ]
+
+        if should_metadata_columns_exist:
+            self.assertEqual(self.remove_metadata_columns_from_rows(table_one), expected_table_one)
+        else:
+            self.assertEqual(table_one, expected_table_one)
+
     #################################
     #           TESTS               #
     #################################
@@ -340,6 +359,19 @@ def test_loading_with_multiple_schema(self):
             should_hard_deleted_rows=False
         )
 
+    def test_loading_tables_with_binary_columns_and_hard_delete(self):
+        """Loading multiple tables from the same input tap with deleted rows"""
+        tap_lines = test_utils.get_test_tap_lines('messages-with-binary-columns.json')
+
+        # Turning on hard delete mode
+        self.config['hard_delete'] = True
+        self.persist_lines_with_cache(tap_lines)
+
+        # Check if data loaded correctly and metadata columns exist
+        self.assert_binary_data_are_in_snowflake(
+            should_metadata_columns_exist=True
+        )
+
     def test_loading_unicode_characters(self):
         """Loading unicode encoded characters"""
         tap_lines = test_utils.get_test_tap_lines('messages-with-unicode-characters.json')
diff --git a/tests/unit/test_db_sync.py b/tests/unit/test_db_sync.py
index 5276809e..aa5905b6 100644
--- a/tests/unit/test_db_sync.py
+++ b/tests/unit/test_db_sync.py
@@ -11,6 +11,22 @@ class TestDBSync(unittest.TestCase):
     def setUp(self):
         self.config = {}
 
+        self.json_types = {
+            'str': {"type": ["string"]},
+            'str_or_null': {"type": ["string", "null"]},
+            'dt': {"type": ["string"], "format": "date-time"},
+            'dt_or_null': {"type": ["string", "null"], "format": "date-time"},
+            'time': {"type": ["string"], "format": "time"},
+            'time_or_null': {"type": ["string", "null"], "format": "time"},
+            'binary': {"type": ["string", "null"], "format": "binary"},
+            'num': {"type": ["number"]},
+            'int': {"type": ["integer"]},
+            'int_or_str': {"type": ["integer", "string"]},
+            'bool': {"type": ["boolean"]},
+            'obj': {"type": ["object"]},
+            'arr': {"type": ["array"]},
+        }
+
     def test_config_validation(self):
         """Test configuration validator"""
         validator = db_sync.validate_config
@@ -57,63 +73,81 @@ def test_column_type_mapping(self):
         """Test JSON type to Snowflake column type mappings"""
         mapper = db_sync.column_type
 
-        # Incoming JSON schema types
-        json_str = {"type": ["string"]}
-        json_str_or_null = {"type": ["string", "null"]}
-        json_dt = {"type": ["string"], "format": "date-time"}
-        json_dt_or_null = {"type": ["string", "null"], "format": "date-time"}
-        json_t = {"type": ["string"], "format": "time"}
-        json_t_or_null = {"type": ["string", "null"], "format": "time"}
-        json_num = {"type": ["number"]}
-        json_int = {"type": ["integer"]}
-        json_int_or_str = {"type": ["integer", "string"]}
-        json_bool = {"type": ["boolean"]}
-        json_obj = {"type": ["object"]}
-        json_arr = {"type": ["array"]}
-
-        # Mapping from JSON schema types ot Snowflake column types
-        self.assertEquals(mapper(json_str), 'text')
-        self.assertEquals(mapper(json_str_or_null), 'text')
-        self.assertEquals(mapper(json_dt), 'timestamp_ntz')
-        self.assertEquals(mapper(json_dt_or_null), 'timestamp_ntz')
-        self.assertEquals(mapper(json_t), 'time')
-        self.assertEquals(mapper(json_t_or_null), 'time')
-        self.assertEquals(mapper(json_num), 'float')
-        self.assertEquals(mapper(json_int), 'number')
-        self.assertEquals(mapper(json_int_or_str), 'text')
-        self.assertEquals(mapper(json_bool), 'boolean')
-        self.assertEquals(mapper(json_obj), 'variant')
-        self.assertEquals(mapper(json_arr), 'variant')
+        # Snowflake column types
+        sf_types = {
+            'str': 'text',
+            'str_or_null': 'text',
+            'dt': 'timestamp_ntz',
+            'dt_or_null': 'timestamp_ntz',
+            'time': 'time',
+            'time_or_null': 'time',
+            'binary': 'binary',
+            'num': 'float',
+            'int': 'number',
+            'int_or_str': 'text',
+            'bool': 'boolean',
+            'obj': 'variant',
+            'arr': 'variant',
+        }
+
+        # Mapping from JSON schema types to Snowflake column types
+        for key, val in self.json_types.items():
+            self.assertEqual(mapper(val), sf_types[key])
+
+    def test_column_trans(self):
+        """Test column transformation"""
+        trans = db_sync.column_trans
+
+        # Snowflake column transformations
+        sf_trans = {
+            'str': '',
+            'str_or_null': '',
+            'dt': '',
+            'dt_or_null': '',
+            'time': '',
+            'time_or_null': '',
+            'binary': 'to_binary',
+            'num': '',
+            'int': '',
+            'int_or_str': '',
+            'bool': '',
+            'obj': 'parse_json',
+            'arr': 'parse_json',
+        }
+
+        # Getting transformations for every JSON type
+        for key, val in self.json_types.items():
+            self.assertEqual(trans(val), sf_trans[key])
 
     def test_stream_name_to_dict(self):
         """Test identifying catalog, schema and table names from fully qualified stream and table names"""
         # Singer stream name format (Default '-' separator)
-        self.assertEquals(
+        self.assertEqual(
             db_sync.stream_name_to_dict('my_table'),
             {"catalog_name": None, "schema_name": None, "table_name": "my_table"})
 
         # Singer stream name format (Default '-' separator)
-        self.assertEquals(
+        self.assertEqual(
             db_sync.stream_name_to_dict('my_schema-my_table'),
             {"catalog_name": None, "schema_name": "my_schema", "table_name": "my_table"})
 
         # Singer stream name format (Default '-' separator)
-        self.assertEquals(
+        self.assertEqual(
             db_sync.stream_name_to_dict('my_catalog-my_schema-my_table'),
             {"catalog_name": "my_catalog", "schema_name": "my_schema", "table_name": "my_table"})
 
         # Snowflake table format (Custom '.' separator)
-        self.assertEquals(
+        self.assertEqual(
             db_sync.stream_name_to_dict('my_table', separator='.'),
             {"catalog_name": None, "schema_name": None, "table_name": "my_table"})
 
         # Snowflake table format (Custom '.' separator)
-        self.assertEquals(
+        self.assertEqual(
             db_sync.stream_name_to_dict('my_schema.my_table', separator='.'),
             {"catalog_name": None, "schema_name": "my_schema", "table_name": "my_table"})
 
         # Snowflake table format (Custom '.' separator)
-        self.assertEquals(
+        self.assertEqual(
             db_sync.stream_name_to_dict('my_catalog.my_schema.my_table', separator='.'),
             {"catalog_name": "my_catalog", "schema_name": "my_schema", "table_name": "my_table"})
 
@@ -123,7 +157,7 @@ def test_flatten_schema(self):
 
         # Schema with no object properties should be empty dict
         schema_with_no_properties = {"type": "object"}
-        self.assertEquals(flatten_schema(schema_with_no_properties), {})
+        self.assertEqual(flatten_schema(schema_with_no_properties), {})
 
         not_nested_schema = {
             "type": "object",
@@ -133,7 +167,7 @@ def test_flatten_schema(self):
                 "c_int": {"type": ["null", "integer"]}}}
 
         # NO FLATTENING - Schema with simple properties should be a plain dictionary
-        self.assertEquals(flatten_schema(not_nested_schema), not_nested_schema['properties'])
+        self.assertEqual(flatten_schema(not_nested_schema), not_nested_schema['properties'])
 
         nested_schema_with_no_properties = {
             "type": "object",
@@ -144,7 +178,7 @@ def test_flatten_schema(self):
                 "c_obj": {"type": ["null", "object"]}}}
 
         # NO FLATTENING - Schema with object type property but without further properties should be a plain dictionary
-        self.assertEquals(flatten_schema(nested_schema_with_no_properties),
+        self.assertEqual(flatten_schema(nested_schema_with_no_properties),
                           nested_schema_with_no_properties['properties'])
 
         nested_schema_with_properties = {
@@ -172,16 +206,16 @@ def test_flatten_schema(self):
 
         # NO FLATTENING - Schema with object type property but without further properties should be a plain dictionary
         # No flattening (default)
-        self.assertEquals(flatten_schema(nested_schema_with_properties), nested_schema_with_properties['properties'])
+        self.assertEqual(flatten_schema(nested_schema_with_properties), nested_schema_with_properties['properties'])
 
         # NO FLATTENING - Schema with object type property but without further properties should be a plain dictionary
         #   max_level: 0 : No flattening (default)
-        self.assertEquals(flatten_schema(nested_schema_with_properties, max_level=0),
+        self.assertEqual(flatten_schema(nested_schema_with_properties, max_level=0),
                           nested_schema_with_properties['properties'])
 
         # FLATTENING - Schema with object type property but without further properties should be a dict with
         # flattened properties
-        self.assertEquals(flatten_schema(nested_schema_with_properties, max_level=1),
+        self.assertEqual(flatten_schema(nested_schema_with_properties, max_level=1),
                           {
                               'c_pk': {'type': ['null', 'integer']},
                               'c_varchar': {'type': ['null', 'string']},
@@ -199,7 +233,7 @@ def test_flatten_schema(self):
 
         # FLATTENING - Schema with object type property but without further properties should be a dict with
         # flattened properties
-        self.assertEquals(flatten_schema(nested_schema_with_properties, max_level=10),
+        self.assertEqual(flatten_schema(nested_schema_with_properties, max_level=10),
                           {
                               'c_pk': {'type': ['null', 'integer']},
                               'c_varchar': {'type': ['null', 'string']},
@@ -216,11 +250,11 @@ def test_flatten_record(self):
 
         empty_record = {}
         # Empty record should be empty dict
-        self.assertEquals(flatten_record(empty_record), {})
+        self.assertEqual(flatten_record(empty_record), {})
 
         not_nested_record = {"c_pk": 1, "c_varchar": "1", "c_int": 1}
         # NO FLATTENING - Record with simple properties should be a plain dictionary
-        self.assertEquals(flatten_record(not_nested_record), not_nested_record)
+        self.assertEqual(flatten_record(not_nested_record), not_nested_record)
 
         nested_record = {
             "c_pk": 1,
@@ -235,7 +269,7 @@ def test_flatten_record(self):
                 }}}
 
         # NO FLATTENING - No flattening (default)
-        self.assertEquals(flatten_record(nested_record),
+        self.assertEqual(flatten_record(nested_record),
                           {
                               "c_pk": 1,
                               "c_varchar": "1",
@@ -246,7 +280,7 @@ def test_flatten_record(self):
 
         # NO FLATTENING
         #   max_level: 0 : No flattening (default)
-        self.assertEquals(flatten_record(nested_record, max_level=0),
+        self.assertEqual(flatten_record(nested_record, max_level=0),
                           {
                               "c_pk": 1,
                               "c_varchar": "1",
@@ -257,7 +291,7 @@ def test_flatten_record(self):
 
         # SEMI FLATTENING
         #   max_level: 1 : Semi-flattening (default)
-        self.assertEquals(flatten_record(nested_record, max_level=1),
+        self.assertEqual(flatten_record(nested_record, max_level=1),
                           {
                               "c_pk": 1,
                               "c_varchar": "1",
@@ -269,7 +303,7 @@ def test_flatten_record(self):
                           })
 
         # FLATTENING
-        self.assertEquals(flatten_record(nested_record, max_level=10),
+        self.assertEqual(flatten_record(nested_record, max_level=10),
                           {
                               "c_pk": 1,
                               "c_varchar": "1",