diff --git a/meltano.yml b/meltano.yml
index 64fa82a0..ac83a3a2 100644
--- a/meltano.yml
+++ b/meltano.yml
@@ -11,9 +11,11 @@ plugins:
       config:
         streams:
         - stream_name: animals
-          input_filename: https://gitlab.com/meltano/tap-smoke-test/-/raw/main/demo-data/animals-data.jsonl
+          input_filename:
+            https://gitlab.com/meltano/tap-smoke-test/-/raw/main/demo-data/animals-data.jsonl
         - stream_name: page_views
-          input_filename: https://gitlab.com/meltano/tap-smoke-test/-/raw/main/demo-data/pageviews-data.jsonl
+          input_filename:
+            https://gitlab.com/meltano/tap-smoke-test/-/raw/main/demo-data/pageviews-data.jsonl
         stream_maps:
           animals:
             __key_properties__: [id]
@@ -30,13 +32,22 @@ plugins:
     - commits.url
     - commits.sha
     - commits.commit_timestamp
+  - name: tap-csv
+    variant: meltanolabs
+    pip_url: git+https://github.com/MeltanoLabs/tap-csv.git
+    config:
+      files:
+      - entity: data_target_postgres
+        path: $MELTANO_PROJECT_ROOT/performance/data.csv
+        keys: [column_1]
+        add_metadata_columns: false
   loaders:
   - name: target-postgres
     namespace: target_postgres
     pip_url: -e .
     settings:
     - name: sqlalchemy_url
-      kind: password
+      kind: string
       sensitive: true
     - name: ssl_enable
       kind: boolean
@@ -46,16 +57,16 @@ plugins:
       sensitive: true
     - name: ssl_mode
     - name: ssl_certificate_authority
-      kind: password
+      kind: string
       sensitive: true
     - name: ssl_client_certificate
-      kind: password
+      kind: string
       sensitive: true
     - name: ssl_client_private_key
-      kind: password
+      kind: string
       sensitive: true
     - name: password
-      kind: password
+      kind: string
       sensitive: true
     - name: host
    - name: port
@@ -72,6 +83,10 @@ plugins:
       password: postgres
       database: postgres
       target_schema: test
+      validate_records: false
       add_record_metadata: true
+  - name: target-postgres-copy-branch
+    inherit_from: target-postgres
+    pip_url: git+https://github.com/kinghuang/target-postgres@bulk-insert-copy
 environments:
 - name: dev
diff --git a/podman.sh b/podman.sh
new file mode 100755
index 00000000..e9666eee
--- /dev/null
+++ b/podman.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Username: postgres, password: postgres
+podman run -e POSTGRES_PASSWORD=postgres -p 5432:5432 -h postgres -d postgres
diff --git a/scripts/performance/.gitignore b/scripts/performance/.gitignore
new file mode 100644
index 00000000..10f39b4d
--- /dev/null
+++ b/scripts/performance/.gitignore
@@ -0,0 +1,2 @@
+data.csv
+data.singer
diff --git a/scripts/performance/1m_rows_generate.py b/scripts/performance/1m_rows_generate.py
new file mode 100644
index 00000000..280aa3bf
--- /dev/null
+++ b/scripts/performance/1m_rows_generate.py
@@ -0,0 +1,31 @@
+import csv
+import random
+import string
+
+num_rows = 1_000_000
+num_columns = 10
+
+
+# Generate a random alphanumeric string for each CSV cell
+def random_string(length=10):
+    return "".join(random.choices(string.ascii_letters + string.digits, k=length))
+
+
+# Generate the CSV file
+csv_filename = "data.csv"
+
+with open(csv_filename, mode="w", newline="") as csv_file:
+    writer = csv.writer(csv_file)
+
+    # Write header
+    header = [f"column_{i+1}" for i in range(num_columns)]
+    writer.writerow(header)
+
+    # Write data rows
+    for _ in range(num_rows):
+        row = [random_string() for _ in range(num_columns)]
+        writer.writerow(row)
+
+print(
+    f"CSV file '{csv_filename}' with {num_rows} rows and {num_columns} columns has been generated."
+)
diff --git a/scripts/performance/README.md b/scripts/performance/README.md
new file mode 100644
index 00000000..349bdaa0
--- /dev/null
+++ b/scripts/performance/README.md
@@ -0,0 +1,59 @@
+# target-postgres Performance Analysis
+
+The main goal is to lay out an objective way to do performance analysis with target-postgres, and hopefully the groundwork for others if they want to do analysis with their targets.
+
+Main points:
+1. We need something to compare to. For Postgres we have native import commands that are well optimized. We will use these as a baseline.
+1. Relative speed is the metric to focus on. If we focus on absolute speed then there's a bunch of hardware considerations that we are not trying to solve here. (We'd need to consider how parallelization fits into the mix if we go there.)
+
+# Why do this work?
+1. Without it we are guessing at what can help improve performance; this gives us a more objective way to pick what we should focus on.
+
+# How to run
+1. `./prep.sh` gets the data together for you in the right place
+2. `python speed_compare.py` runs all the tests and gives you the times for each test
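+
+For reference, the comparison boils down to timing each test script and dividing by the native-copy baseline. A minimal sketch of that idea (this is not the actual `speed_compare.py`; the test list below is illustrative):
+
+```python
+import subprocess
+import time
+
+tests = [
+    "./perf_tests/pg_copy_upsert.sh",  # native COPY baseline
+    "./perf_tests/target_postgres_current_branch.sh",
+]
+
+timings = {}
+for test in tests:
+    start = time.perf_counter()
+    subprocess.run([test], check=True)
+    timings[test] = time.perf_counter() - start
+
+baseline = timings[tests[0]]
+for test, seconds in timings.items():
+    print(f"{test}: {seconds:.2f}s ({seconds / baseline:.4f}x slower than native copy)")
+```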
+
+# Results for 1 million records
+| **Test Name**                                                | **Total Run Time (s)** | **x Slower Than Native Copy** |
+|--------------------------------------------------------------|------------------------|-------------------------------|
+| `./perf_tests/pg_copy_upsert.sh`                             | 13.64                  | 1.0000                        |
+| `./perf_tests/target_postgres_copy_branch_no_validate.sh`    | 100.50                 | 7.3697                        |
+| `./perf_tests/target_postgres_current_branch_no_validate.sh` | 141.48                 | 10.3749                       |
+| `./perf_tests/target_postgres_copy_branch.sh`                | 265.53                 | 19.4719                       |
+| `./perf_tests/target_postgres_current_branch.sh`             | 298.37                 | 21.8799                       |
+
+The last column is each test's total run time divided by the native `COPY` baseline, e.g. 298.37 s / 13.64 s ≈ 21.88.
+
+# Other questions / concerns
+1. `COPY` is single threaded; there's no reason we need to stick to a single thread. https://github.com/dimitri/pgloader is much faster. We should try this out as well.
+1. `prep.sh`'s tap-csv step runs to give us a data.singer file (jsonl output from the tap); this takes an extremely long time to run for one million records.
+
+# Next steps to improve performance
+- [ ] Split the current [Bulk Insert Speed PR](https://github.com/MeltanoLabs/target-postgres/pull/370) into a separate sink that can be turned on with a configuration setting
+- [ ] Test the new sink with the same tests as the main sink and add failures for the ones we know do not pass
+- [ ] Add a note for folks in the main README about performance: the best performance right now comes from turning on COPY mode and turning off record validation
+- [ ] Evaluate why we're not closer to native copy speeds. Within 50% of native speeds seems reasonable, but that's just a guess
+- [ ] Add [pg_loader](https://github.com/dimitri/pgloader) with multiple threads; there's no reason we couldn't do something similar in targets
+- [ ] Add a CI job that calculates the performance implications of a PR on every run
diff --git a/scripts/performance/meltano.yml b/scripts/performance/meltano.yml
new file mode 100644
index 00000000..c25b2b5c
--- /dev/null
+++ b/scripts/performance/meltano.yml
@@ -0,0 +1,67 @@
+version: 1
+send_anonymous_usage_stats: true
+default_environment: dev
+project_id: target-postgres
+plugins:
+  extractors:
+  - name: tap-csv
+    variant: meltanolabs
+    pip_url: git+https://github.com/MeltanoLabs/tap-csv.git
+    config:
+      files:
+      - entity: data_target_postgres
+        path: $MELTANO_PROJECT_ROOT/data.csv
+        keys: [column_1]
+        add_metadata_columns: false
+  loaders:
+  - name: target-postgres
+    namespace: target_postgres
+    pip_url: -e ../../.
+    settings:
+    - name: sqlalchemy_url
+      kind: string
+      sensitive: true
+    - name: ssl_enable
+      kind: boolean
+      sensitive: true
+    - name: ssl_client_certificate_enable
+      kind: boolean
+      sensitive: true
+    - name: ssl_mode
+    - name: ssl_certificate_authority
+      kind: string
+      sensitive: true
+    - name: ssl_client_certificate
+      kind: string
+      sensitive: true
+    - name: ssl_client_private_key
+      kind: string
+      sensitive: true
+    - name: password
+      kind: string
+      sensitive: true
+    - name: host
+    - name: port
+      kind: integer
+    - name: user
+    - name: database
+    - name: target_schema
+    - name: add_record_metadata
+      kind: boolean
+    - name: validate_records
+      kind: boolean
+    - name: batch_size_rows
+      kind: integer
+    config:
+      host: localhost
+      port: 5432
+      user: postgres
+      password: postgres
+      database: postgres
+      target_schema: test
+      add_record_metadata: true
+  - name: target-postgres-copy-branch
+    inherit_from: target-postgres
+    pip_url: git+https://github.com/kinghuang/target-postgres@bulk-insert-copy
+environments:
+- name: dev
diff --git a/scripts/performance/perf_tests/pg_copy_upsert.sh b/scripts/performance/perf_tests/pg_copy_upsert.sh
new file mode 100755
index 00000000..a45d8cf5
--- /dev/null
+++ b/scripts/performance/perf_tests/pg_copy_upsert.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Variables
+CSV_FILE="data.csv"
+DB_NAME="postgres"
+DB_USER="postgres"
+DB_PASSWORD="postgres"
+DB_HOST="localhost"
+DB_PORT="5432"
+
+# Export the password to avoid being prompted
+export PGPASSWORD=$DB_PASSWORD
+
+# Execute COPY command to import the CSV into PostgreSQL
+#psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME -c "\COPY large_data FROM '$CSV_FILE' CSV HEADER;"
+# Begin transaction
+psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME <<EOF
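+-- NOTE: what follows is a hedged sketch of a COPY-based upsert, not the
+-- script's verbatim SQL; the staging table and the conflict handling on
+-- column_1 are assumptions based on the table definition used by prep.sh.
+BEGIN;
+CREATE TEMP TABLE staging (LIKE test.data_target_postgres);
+\copy staging FROM '$CSV_FILE' CSV HEADER
+INSERT INTO test.data_target_postgres
+SELECT DISTINCT ON (column_1) * FROM staging
+ON CONFLICT (column_1) DO UPDATE
+    SET column_2 = EXCLUDED.column_2; -- and likewise for the remaining columns
+COMMIT;
+EOF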
files-def.json" + }, + { + "name": "faker_config.locale", + "kind": "array", + "label": "Faker Config Locale", + "description": "One or more LCID locale strings to produce localized output for: https://faker.readthedocs.io/en/master/#localization" + }, + { + "name": "faker_config.seed", + "kind": "string", + "label": "Faker Config Seed", + "description": "Value to seed the Faker generator for deterministic output: https://faker.readthedocs.io/en/master/#seeding-the-generator" + }, + { + "name": "files", + "kind": "array", + "label": "Files", + "description": "Array of objects with `entity`, `path`, `keys`, and `encoding` [Optional] keys:\n\n* `entity`: The entity name, used as the table name for the data loaded from that CSV.\n* `path`: Local path (relative to the project's root) to the file to be ingested. Note that this may be a directory, in which case all files in that directory and any of its subdirectories will be recursively processed\n* `keys`: The names of the columns that constitute the unique keys for that entity.\n* `encoding`: [Optional] The file encoding to use when reading the file (i.e. \"latin1\", \"UTF-8\"). Use this setting when you get a UnicodeDecodeError error.\n Each input CSV file must be a traditionally-delimited CSV (comma separated columns, newlines indicate new rows, double quoted values).\n\nThe following entries are passed through in an internal CSV dialect that then is used to configure the CSV reader:\n\n* `delimiter`: A one-character string used to separate fields. It defaults to ','.\n* `doublequote`: Controls how instances of quotechar appearing inside a field should themselves be quoted. When True, the character is doubled. When False, the escapechar is used as a prefix to the quotechar. It defaults to True.\n* `escapechar`: A one-character string used by the reader, where the escapechar removes any special meaning from the following character. It defaults to None, which disables escaping.\n* `quotechar`: A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '\"'.\n* `skipinitialspace`: When True, spaces immediately following the delimiter are ignored. The default is False.\n* `strict`: When True, raise exception Error on bad CSV input. The default is False.\n\nThe first row is the header defining the attribute name for that column and will result to a column of the same name in the database. It must have a valid format with no spaces or special characters (like for example `!` or `@`, etc).\n" + }, + { + "name": "flattening_enabled", + "kind": "boolean", + "label": "Flattening Enabled", + "description": "'True' to enable schema flattening and automatically expand nested properties." + }, + { + "name": "flattening_max_depth", + "kind": "integer", + "label": "Flattening Max Depth", + "description": "The max depth to flatten schemas." + }, + { + "name": "stream_map_config", + "kind": "object", + "label": "Stream Map Config", + "description": "User-defined config values to be used within map expressions." + }, + { + "name": "stream_maps", + "kind": "object", + "label": "Stream Maps", + "description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)." 
diff --git a/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock b/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock
new file mode 100644
index 00000000..da622e8f
--- /dev/null
+++ b/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock
@@ -0,0 +1,177 @@
+{
+  "plugin_type": "extractors",
+  "name": "tap-github",
+  "namespace": "tap_github",
+  "variant": "meltanolabs",
+  "label": "GitHub",
+  "docs": "https://hub.meltano.com/extractors/tap-github--meltanolabs",
+  "repo": "https://github.com/MeltanoLabs/tap-github",
+  "pip_url": "git+https://github.com/MeltanoLabs/tap-github.git",
+  "description": "Code hosting platform",
+  "logo_url": "https://hub.meltano.com/assets/logos/extractors/github.png",
+  "capabilities": [
+    "about",
+    "batch",
+    "catalog",
+    "discover",
+    "schema-flattening",
+    "state",
+    "stream-maps"
+  ],
+  "settings_group_validation": [
+    [
+      "repositories"
+    ],
+    [
+      "organizations"
+    ],
+    [
+      "searches"
+    ],
+    [
+      "user_usernames"
+    ],
+    [
+      "user_ids"
+    ]
+  ],
+  "settings": [
+    {
+      "name": "additional_auth_tokens",
+      "kind": "array",
+      "label": "Additional Auth Tokens",
+      "description": "List of GitHub tokens to authenticate with. Streams will loop through them when hitting rate limits."
+    },
+    {
+      "name": "auth_token",
+      "kind": "string",
+      "label": "Auth Token",
+      "description": "GitHub token to authenticate with.",
+      "sensitive": true
+    },
+    {
+      "name": "batch_config.encoding.compression",
+      "kind": "options",
+      "label": "Batch Config Encoding Compression",
+      "description": "Compression format to use for batch files.",
+      "options": [
+        {
+          "label": "Gzip",
+          "value": "gzip"
+        },
+        {
+          "label": "None",
+          "value": "none"
+        }
+      ]
+    },
+    {
+      "name": "batch_config.encoding.format",
+      "kind": "options",
+      "label": "Batch Config Encoding Format",
+      "description": "Format to use for batch files.",
+      "options": [
+        {
+          "label": "Jsonl",
+          "value": "jsonl"
+        }
+      ]
+    },
+    {
+      "name": "batch_config.storage.prefix",
+      "kind": "string",
+      "label": "Batch Config Storage Prefix",
+      "description": "Prefix to use when writing batch files."
+    },
+    {
+      "name": "batch_config.storage.root",
+      "kind": "string",
+      "label": "Batch Config Storage Root",
+      "description": "Root path to use when writing batch files."
+    },
+    {
+      "name": "flattening_enabled",
+      "kind": "boolean",
+      "label": "Flattening Enabled",
+      "description": "'True' to enable schema flattening and automatically expand nested properties."
+    },
+    {
+      "name": "flattening_max_depth",
+      "kind": "integer",
+      "label": "Flattening Max Depth",
+      "description": "The max depth to flatten schemas."
+    },
+    {
+      "name": "metrics_log_level",
+      "kind": "string",
+      "label": "Metrics Log Level",
+      "description": "The log level of the API response metrics."
+    },
+    {
+      "name": "organizations",
+      "kind": "array",
+      "label": "Organizations",
+      "description": "An array of strings containing the github organizations to be included"
+    },
+    {
+      "name": "rate_limit_buffer",
+      "kind": "integer",
+      "label": "Rate Limit Buffer",
+      "description": "Add a buffer to avoid consuming all query points for the token at hand. Defaults to 1000."
+    },
+    {
+      "name": "repositories",
+      "kind": "array",
+      "label": "Repositories",
+      "description": "An array of strings containing the github repos to be included"
+    },
+    {
+      "name": "searches",
+      "kind": "array",
+      "label": "Searches",
+      "description": "An array of search descriptor objects with the following properties. \"name\" - a human readable name for the search query. \"query\" - a github search string (generally the same as would come after ?q= in the URL)"
+    },
\"query\" - a github search string (generally the same as would come after ?q= in the URL)" + }, + { + "name": "skip_parent_streams", + "kind": "boolean", + "label": "Skip Parent Streams", + "description": "Set to true to skip API calls for the parent streams (such as repositories) if it is not selected but children are" + }, + { + "name": "start_date", + "kind": "date_iso8601", + "label": "Start Date" + }, + { + "name": "stream_map_config", + "kind": "object", + "label": "Stream Map Config" + }, + { + "name": "stream_maps", + "kind": "object", + "label": "Stream Maps" + }, + { + "name": "user_agent", + "kind": "string", + "label": "User Agent" + }, + { + "name": "user_ids", + "kind": "array", + "label": "User IDs", + "description": "A list of GitHub user ids." + }, + { + "name": "user_usernames", + "kind": "array", + "label": "User Usernames", + "description": "A list of GithHub usernames." + } + ], + "select": [ + "*.*", + "!traffic_*.*" + ] +} diff --git a/scripts/performance/prep.sh b/scripts/performance/prep.sh new file mode 100755 index 00000000..9d6303c1 --- /dev/null +++ b/scripts/performance/prep.sh @@ -0,0 +1,34 @@ +#!/bin/bash +time python 1m_rows_generate.py +time meltano invoke tap-csv > data.singer + +# Create initial table in postgres + +#Spin up postgres instance +podman run -e POSTGRES_PASSWORD=postgres -p 5432:5432 -h postgres -d postgres + +#Vars We'd definietly want this as a meltano utility, just as POC right now +DB_NAME="postgres" +DB_USER="postgres" +DB_PASSWORD="postgres" +DB_HOST="localhost" +DB_PORT="5432" +export PGPASSWORD=$DB_PASSWORD + +psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME <