From 6db9761a46d66ba3c98f62d65108d3dde43d2bfc Mon Sep 17 00:00:00 2001 From: Neil McCallum Date: Mon, 12 Feb 2024 10:25:15 +1300 Subject: [PATCH 1/2] tests and proposed fix for tab char data corruption --- target_snowflake/file_formats/csv.py | 2 +- tests/unit/file_formats/test_csv.py | 6 +++++- tests/unit/file_formats/test_parquet.py | 6 +++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/target_snowflake/file_formats/csv.py b/target_snowflake/file_formats/csv.py index 42d76ee4..a0748808 100644 --- a/target_snowflake/file_formats/csv.py +++ b/target_snowflake/file_formats/csv.py @@ -63,7 +63,7 @@ def record_to_csv_line(record: dict, return ','.join( [ - json.dumps(flatten_record[column], ensure_ascii=False) if column in flatten_record and ( + '"{}"'.format(flatten_record[column].replace('"','""')) if column in flatten_record and ( flatten_record[column] == 0 or flatten_record[column]) else '' for column in schema ] diff --git a/tests/unit/file_formats/test_csv.py b/tests/unit/file_formats/test_csv.py index 900ba9f1..ed45b8db 100644 --- a/tests/unit/file_formats/test_csv.py +++ b/tests/unit/file_formats/test_csv.py @@ -62,6 +62,7 @@ def test_record_to_csv_line(self): 'key4': '25:01:01', 'key5': 'I\'m good', 'key6': None, + 'key7': 'tab\tin\tvalue\n\r\0"ὠ', } schema = { @@ -89,10 +90,13 @@ def test_record_to_csv_line(self): 'key6': { 'type': ['null', 'string'], 'format': 'time', }, + 'key7':{ + 'type':['null', 'string'], + }, } self.assertEqual(csv.record_to_csv_line(record, schema), - '"1","2030-01-22","10000-01-22 12:04:22","25:01:01","I\'m good",') + '"1","2030-01-22","10000-01-22 12:04:22","25:01:01","I\'m good",,"tab\tin\tvalue\n\r\0""ὠ"') def test_create_copy_sql(self): self.assertEqual(csv.create_copy_sql(table_name='foo_table', diff --git a/tests/unit/file_formats/test_parquet.py b/tests/unit/file_formats/test_parquet.py index c20aa44e..0476b20c 100644 --- a/tests/unit/file_formats/test_parquet.py +++ b/tests/unit/file_formats/test_parquet.py @@ -21,6 +21,7 @@ def test_records_to_dataframe(self): 'key4': '12:01:01', 'key5': 'I\'m good', 'key6': None, + 'key7': 'A tab is a char too', }, '2': { 'key1': 2, @@ -29,6 +30,7 @@ def test_records_to_dataframe(self): 'key4': '13:01:01', 'key5': 'I\'m good too', 'key6': None, + 'key7': 'A\tis a char too', }, '3': { 'key1': 3, @@ -37,6 +39,7 @@ def test_records_to_dataframe(self): 'key4': '14:01:01', 'key5': 'I want to be good', 'key6': None, + 'key7': 'A\t\tis a char too', } } @@ -48,7 +51,8 @@ def test_records_to_dataframe(self): 'key3': ['10000-01-22 12:04:22', '10000-01-22 12:04:22', '10000-01-22 12:04:22'], 'key4': ['12:01:01', '13:01:01', '14:01:01'], 'key5': ['I\'m good', 'I\'m good too', 'I want to be good'], - 'key6': [None, None, None]})) + 'key6': [None, None, None], + 'key7': ['A tab is a char too', 'A\tis a char too', 'A\t\tis a char too'] })) def test_create_copy_sql(self): self.assertEqual(parquet.create_copy_sql(table_name='foo_table', From d0577570c2c84ca693bba3efedd60df91d42dc2c Mon Sep 17 00:00:00 2001 From: Neil McCallum Date: Wed, 14 Feb 2024 15:20:00 +1300 Subject: [PATCH 2/2] explicit str cast for bool etc --- target_snowflake/file_formats/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target_snowflake/file_formats/csv.py b/target_snowflake/file_formats/csv.py index a0748808..02a67ecf 100644 --- a/target_snowflake/file_formats/csv.py +++ b/target_snowflake/file_formats/csv.py @@ -63,7 +63,7 @@ def record_to_csv_line(record: dict, return ','.join( [ - '"{}"'.format(flatten_record[column].replace('"','""')) if column in flatten_record and ( + '"{}"'.format(str(flatten_record[column]).replace('"','""')) if column in flatten_record and ( flatten_record[column] == 0 or flatten_record[column]) else '' for column in schema ]