This repository has been archived by the owner on Sep 23, 2024. It is now read-only.

Trick Postgres into more efficient use of index (#189)
louis-pie authored Sep 9, 2022
1 parent e7db01e commit 4c29e65
Showing 3 changed files with 18 additions and 9 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -1,3 +1,8 @@
+1.8.4 (2022-09-08)
+-------------------
+**Changes**
+- INCREMENTAL: Use sub-query to trick PostgreSQL into more efficient use of index.
+
 1.8.3 (2022-01-18)
 -------------------
 **Fixes**
@@ -48,7 +53,7 @@
 Fix data loss issue when running `LOG_BASED` due to the tap not sending new SCHEMA singer messages when source tables change structure, mainly new/renamed columns, which causes the target to not be up to date with the stream structure.
 The tap now:
 * Runs discovery for selected stream at the beginning of sync to send up to date SCHEMA singer messages
-* When new columns are detected in WAL payloads, then run discovery for the stream and send new SCHEMA message.
+* When new columns are detected in WAL payloads, then run discovery for the stream and send new SCHEMA message.
 
 1.6.2 (2020-05-18)
 -------------------
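The new 1.8.4 entry above refers to wrapping the incremental SELECT in a sub-query. As a rough illustration of the query shapes involved (the table, columns, and bookmark value below are hypothetical placeholders, not taken from this commit), the change rewrites the first form into the second:

# Illustrative sketch only: hypothetical table "public"."orders" with
# replication key "updated_at"; all identifiers and values are placeholders.

# Shape of the incremental query before this commit:
OLD_SQL = """
SELECT "id","updated_at","status"
FROM "public"."orders"
WHERE "updated_at" >= '2022-09-01 00:00:00'::timestamp without time zone
ORDER BY "updated_at" ASC
"""

# Shape after this commit: the filter and ORDER BY move into a sub-query,
# which the commit reports helps PostgreSQL make better use of the
# replication-key index.
NEW_SQL = """
SELECT "id","updated_at","status"
FROM (
    SELECT *
    FROM "public"."orders"
    WHERE "updated_at" >= '2022-09-01 00:00:00'::timestamp without time zone
    ORDER BY "updated_at" ASC
) pg_speedup_trick
"""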
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@
     long_description = f.read()
 
 setup(name='pipelinewise-tap-postgres',
-      version='1.8.3',
+      version='1.8.4',
       description='Singer.io tap for extracting data from PostgresSQL - PipelineWise compatible',
       long_description=long_description,
       long_description_content_type='text/markdown',
18 changes: 11 additions & 7 deletions tap_postgres/sync_strategies/incremental.py
@@ -129,13 +129,17 @@ def _get_select_sql(params):
     stream = params['stream']
     if replication_key_value:
         select_sql = f"""
-                        SELECT {','.join(escaped_columns)}
-                        FROM {post_db.fully_qualified_table_name(schema_name, stream['table_name'])}
-                        WHERE {post_db.prepare_columns_sql(replication_key)} >= '{replication_key_value}'::{replication_key_sql_datatype}
-                        ORDER BY {post_db.prepare_columns_sql(replication_key)} ASC"""
+                        SELECT {','.join(escaped_columns)}
+                        FROM (
+                            SELECT *
+                            FROM {post_db.fully_qualified_table_name(schema_name, stream['table_name'])}
+                            WHERE {post_db.prepare_columns_sql(replication_key)} >= '{replication_key_value}'::{replication_key_sql_datatype}
+                            ORDER BY {post_db.prepare_columns_sql(replication_key)} ASC
+                        ) pg_speedup_trick"""
     else:
         # if not replication_key_value
-        select_sql = f"""SELECT {','.join(escaped_columns)}
-                        FROM {post_db.fully_qualified_table_name(schema_name, stream['table_name'])}
-                        ORDER BY {post_db.prepare_columns_sql(replication_key)} ASC"""
+        select_sql = f"""
+                        SELECT {','.join(escaped_columns)}
+                        FROM {post_db.fully_qualified_table_name(schema_name, stream['table_name'])}
+                        """
     return select_sql
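
One way to check that the wrapped form changes the execution plan is to EXPLAIN it against a real database. The sketch below is not part of the tap: it assumes psycopg2 is installed and uses a placeholder DSN, table, and columns; swap in the unwrapped query to compare plans.

import psycopg2  # assumption: psycopg2 is available in your environment

DSN = "postgresql://user:password@localhost:5432/exampledb"  # placeholder

# Same sub-query shape as the new _get_select_sql() branch, with
# hypothetical identifiers and bookmark value.
WRAPPED_SQL = """
SELECT "id","updated_at"
FROM (
    SELECT *
    FROM "public"."orders"
    WHERE "updated_at" >= '2022-09-01 00:00:00'::timestamp without time zone
    ORDER BY "updated_at" ASC
) pg_speedup_trick
"""

with psycopg2.connect(DSN) as conn, conn.cursor() as cur:
    # EXPLAIN prints the planner's chosen strategy, so you can see whether
    # the replication-key index is being used for this query shape.
    cur.execute("EXPLAIN " + WRAPPED_SQL)
    for (line,) in cur.fetchall():
        print(line)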
