From 9e694b68e5bbff2f803822150b60d3aec7daa72d Mon Sep 17 00:00:00 2001 From: Tanuj Nayak Date: Tue, 11 Feb 2025 14:34:39 -0800 Subject: [PATCH] [#25990] YSQL: Upgrade to code from PgVector 0.8.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: This diff updates the `src/postgres/third-party-extensions/pgvector/` folder to include new functions and datatypes from upstream pgvector 0.8.0 (2627c5ff775ae6d7aef0c430121ccf857842d2f2). This change brings in the following new features: - New Datatypes -- `sparsevec` -- `halfvec` -- Input/output/send/receive/typmod_in functions for the above - New Functions -- `l1_distance` -- `l2_normalize` -- `hamming_distance` -- `jaccard_distance` -- `binary_quantize` -- `subvector` -- operators for the above distance functions -- Common aggregates such as sum/avg for the new datatypes This change merges code for new functions and datatypes from upstream into its code files. Code files relating to ivfflat have also been deleted in this change as we do not intend to repurpose any of pgvector’s index access methods. For this reason, any upstream code relating to hnsw and ivfflat has not been merged. Also, while upstream pgvector stores all the new vector types (`vector`, `sparsevec`, `halfvec`) in external (TOASTed) format, this change keeps those types in extended storage format since YB doesn't TOAST any values. The test folder has been updated to match pgvector's test folder with the exception of files related to ivfflat and hnsw. Similarly, Perl test files were cleaned out from the `src/postgres/third-party-extensions/pgvector/test/t` directory as they are not used by YB. `copy.sql` and `copy.out` were slightly modified to copy tables to and from files in the `/tmp/` directory to be compatible with our build system. Hence, these files are ported as `yb.port.copy.sql` and `yb.port.copy.out`. 
Users on an older version of the vector extension must recreate this extension to get onto the newer version. Our initial port of pgvector ported the vector datatype as-is. Later, we modified the vector datatype to contain an embedded vector id in the form of a UUID. An upgrade path to 0.8.0-yb-1.0 is omitted to force users to use this newer version of the vector datatype. From 0.8.0-yb-1.0 going forward, we want to prevent users using the old vector instances which did not have any associated vector ids. This still means though that a followup change has to be made to force users to drop any old version of their vector extension even if they don't choose to try to upgrade to 0.8.0-yb-1.0. Extension creation scripts for older versions of vector have been removed to prevent users from creating these old versions of the extension. **Note: Upstream pgvector's changes merged into this change removed PG12-related ifdefs as PG12 is EOL. That means that this change cannot be backported to 2024.2 and below.** Test Plan: ./yb_build.sh --java-test 'org.yb.pgsql.TestPgRegressThirdPartyPgvector' Reviewers: kramanathan, jason Reviewed By: jason Subscribers: mihnea, jason, yql Differential Revision: https://phorge.dev.yugabyte.com/D41864 --- .../src/test/regress/expected/copy.out | 293 +--- .../pgvector/CHANGELOG.md | 92 +- .../pgvector/Dockerfile | 4 +- .../third-party-extensions/pgvector/LICENSE | 2 +- .../third-party-extensions/pgvector/META.json | 6 +- .../third-party-extensions/pgvector/Makefile | 25 +- .../pgvector/Makefile.win | 31 +- .../third-party-extensions/pgvector/README.md | 969 +++++++++++-- .../vector--0.4.4-yb-1.0--0.4.4-yb-1.1.sql | 20 - .../pgvector/sql/vector--0.4.4-yb-1.0.sql | 210 --- .../vector--0.4.4-yb-1.1--0.4.4-yb-1.2.sql | 13 - .../pgvector/sql/vector.sql | 603 +++++++- .../pgvector/src/bitutils.c | 222 +++ .../pgvector/src/bitutils.h | 16 + .../pgvector/src/bitvec.c | 69 + .../pgvector/src/bitvec.h | 8 + .../pgvector/src/halfutils.c | 298 ++++ 
.../pgvector/src/halfutils.h | 263 ++++ .../pgvector/src/halfvec.c | 1189 ++++++++++++++++ .../pgvector/src/halfvec.h | 70 + .../pgvector/src/ivfbuild.c | 666 --------- .../pgvector/src/ivfflat.c | 268 ---- .../pgvector/src/ivfflat.h | 254 ---- .../pgvector/src/ivfinsert.c | 217 --- .../pgvector/src/ivfkmeans.c | 536 ------- .../pgvector/src/ivfscan.c | 364 ----- .../pgvector/src/ivfutils.c | 227 --- .../pgvector/src/ivfvacuum.c | 161 --- .../pgvector/src/sparsevec.c | 1256 +++++++++++++++++ .../pgvector/src/sparsevec.h | 40 + .../pgvector/src/vector.c | 741 ++++++---- .../pgvector/src/vector.h | 31 +- .../pgvector/test/expected/bit.out | 140 ++ .../pgvector/test/expected/btree.out | 46 +- .../pgvector/test/expected/cast.out | 228 ++- .../pgvector/test/expected/copy.out | 39 +- .../pgvector/test/expected/functions.out | 110 -- .../pgvector/test/expected/halfvec.out | 636 +++++++++ .../pgvector/test/expected/input.out | 124 -- .../pgvector/test/expected/ivfflat_cosine.out | 19 - .../pgvector/test/expected/ivfflat_ip.out | 20 - .../pgvector/test/expected/ivfflat_l2.out | 26 - .../test/expected/ivfflat_options.out | 15 - .../test/expected/ivfflat_unlogged.out | 13 - .../pgvector/test/expected/sparsevec.out | 653 +++++++++ .../pgvector/test/expected/vector_type.out | 672 +++++++++ .../pgvector/test/expected/yb.orig.setup.out | 2 +- .../pgvector/test/expected/yb.port.copy.out | 51 + .../pgvector/test/perl/PostgresNode.pm | 8 - .../pgvector/test/perl/TestLib.pm | 3 - .../pgvector/test/sql/bit.sql | 27 + .../pgvector/test/sql/btree.sql | 26 +- .../pgvector/test/sql/cast.sql | 67 +- .../pgvector/test/sql/copy.sql | 36 +- .../pgvector/test/sql/functions.sql | 29 - .../pgvector/test/sql/halfvec.sql | 147 ++ .../pgvector/test/sql/input.sql | 27 - .../pgvector/test/sql/ivfflat_cosine.sql | 12 - .../pgvector/test/sql/ivfflat_ip.sql | 12 - .../pgvector/test/sql/ivfflat_l2.sql | 13 - .../pgvector/test/sql/ivfflat_options.sql | 9 - .../pgvector/test/sql/ivfflat_unlogged.sql | 9 - 
.../pgvector/test/sql/sparsevec.sql | 134 ++ .../pgvector/test/sql/vector_type.sql | 154 ++ .../pgvector/test/sql/yb.orig.setup.sql | 2 +- .../pgvector/test/sql/yb.port.copy.sql | 44 + .../pgvector/test/t/001_wal.pl | 97 -- .../pgvector/test/t/002_vacuum.pl | 41 - .../pgvector/test/t/003_recall.pl | 88 -- .../pgvector/test/t/004_centers.pl | 36 - .../pgvector/test/t/005_query_recall.pl | 45 - .../pgvector/test/t/006_lists.pl | 31 - .../pgvector/test/t/007_inserts.pl | 55 - .../pgvector/test/t/008_avg.pl | 35 - .../pgvector/test/t/009_storage.pl | 32 - .../pgvector/test/yb_schedule | 7 +- .../pgvector/vector.control | 4 +- 77 files changed, 8667 insertions(+), 4521 deletions(-) delete mode 100644 src/postgres/third-party-extensions/pgvector/sql/vector--0.4.4-yb-1.0--0.4.4-yb-1.1.sql delete mode 100644 src/postgres/third-party-extensions/pgvector/sql/vector--0.4.4-yb-1.0.sql delete mode 100644 src/postgres/third-party-extensions/pgvector/sql/vector--0.4.4-yb-1.1--0.4.4-yb-1.2.sql create mode 100644 src/postgres/third-party-extensions/pgvector/src/bitutils.c create mode 100644 src/postgres/third-party-extensions/pgvector/src/bitutils.h create mode 100644 src/postgres/third-party-extensions/pgvector/src/bitvec.c create mode 100644 src/postgres/third-party-extensions/pgvector/src/bitvec.h create mode 100644 src/postgres/third-party-extensions/pgvector/src/halfutils.c create mode 100644 src/postgres/third-party-extensions/pgvector/src/halfutils.h create mode 100644 src/postgres/third-party-extensions/pgvector/src/halfvec.c create mode 100644 src/postgres/third-party-extensions/pgvector/src/halfvec.h delete mode 100644 src/postgres/third-party-extensions/pgvector/src/ivfbuild.c delete mode 100644 src/postgres/third-party-extensions/pgvector/src/ivfflat.c delete mode 100644 src/postgres/third-party-extensions/pgvector/src/ivfflat.h delete mode 100644 src/postgres/third-party-extensions/pgvector/src/ivfinsert.c delete mode 100644 
src/postgres/third-party-extensions/pgvector/src/ivfkmeans.c delete mode 100644 src/postgres/third-party-extensions/pgvector/src/ivfscan.c delete mode 100644 src/postgres/third-party-extensions/pgvector/src/ivfutils.c delete mode 100644 src/postgres/third-party-extensions/pgvector/src/ivfvacuum.c create mode 100644 src/postgres/third-party-extensions/pgvector/src/sparsevec.c create mode 100644 src/postgres/third-party-extensions/pgvector/src/sparsevec.h create mode 100644 src/postgres/third-party-extensions/pgvector/test/expected/bit.out delete mode 100644 src/postgres/third-party-extensions/pgvector/test/expected/functions.out create mode 100644 src/postgres/third-party-extensions/pgvector/test/expected/halfvec.out delete mode 100644 src/postgres/third-party-extensions/pgvector/test/expected/input.out delete mode 100644 src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_cosine.out delete mode 100644 src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_ip.out delete mode 100644 src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_l2.out delete mode 100644 src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_options.out delete mode 100644 src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_unlogged.out create mode 100644 src/postgres/third-party-extensions/pgvector/test/expected/sparsevec.out create mode 100644 src/postgres/third-party-extensions/pgvector/test/expected/vector_type.out create mode 100644 src/postgres/third-party-extensions/pgvector/test/expected/yb.port.copy.out delete mode 100644 src/postgres/third-party-extensions/pgvector/test/perl/PostgresNode.pm delete mode 100644 src/postgres/third-party-extensions/pgvector/test/perl/TestLib.pm create mode 100644 src/postgres/third-party-extensions/pgvector/test/sql/bit.sql delete mode 100644 src/postgres/third-party-extensions/pgvector/test/sql/functions.sql create mode 100644 
src/postgres/third-party-extensions/pgvector/test/sql/halfvec.sql delete mode 100644 src/postgres/third-party-extensions/pgvector/test/sql/input.sql delete mode 100644 src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_cosine.sql delete mode 100644 src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_ip.sql delete mode 100644 src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_l2.sql delete mode 100644 src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_options.sql delete mode 100644 src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_unlogged.sql create mode 100644 src/postgres/third-party-extensions/pgvector/test/sql/sparsevec.sql create mode 100644 src/postgres/third-party-extensions/pgvector/test/sql/vector_type.sql create mode 100644 src/postgres/third-party-extensions/pgvector/test/sql/yb.port.copy.sql delete mode 100644 src/postgres/third-party-extensions/pgvector/test/t/001_wal.pl delete mode 100644 src/postgres/third-party-extensions/pgvector/test/t/002_vacuum.pl delete mode 100644 src/postgres/third-party-extensions/pgvector/test/t/003_recall.pl delete mode 100644 src/postgres/third-party-extensions/pgvector/test/t/004_centers.pl delete mode 100644 src/postgres/third-party-extensions/pgvector/test/t/005_query_recall.pl delete mode 100644 src/postgres/third-party-extensions/pgvector/test/t/006_lists.pl delete mode 100644 src/postgres/third-party-extensions/pgvector/test/t/007_inserts.pl delete mode 100644 src/postgres/third-party-extensions/pgvector/test/t/008_avg.pl delete mode 100644 src/postgres/third-party-extensions/pgvector/test/t/009_storage.pl diff --git a/src/postgres/src/test/regress/expected/copy.out b/src/postgres/src/test/regress/expected/copy.out index 8a8bf43fdea0..9b4ebc088b79 100644 --- a/src/postgres/src/test/regress/expected/copy.out +++ b/src/postgres/src/test/regress/expected/copy.out @@ -1,242 +1,51 @@ --- --- COPY --- --- directory paths are passed to us in environment variables 
-\getenv abs_srcdir PG_ABS_SRCDIR -\getenv abs_builddir PG_ABS_BUILDDIR ---- test copying in CSV mode with various styles ---- of embedded line ending characters -create temp table copytest ( - style text, - test text, - filler int); -insert into copytest values('DOS',E'abc\r\ndef',1); -insert into copytest values('Unix',E'abc\ndef',2); -insert into copytest values('Mac',E'abc\rdef',3); -insert into copytest values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4); -\set filename :abs_builddir '/results/copytest.csv' -copy copytest to :'filename' csv; -create temp table copytest2 (like copytest); -copy copytest2 from :'filename' csv; -select * from copytest except select * from copytest2; - style | test | filler --------+------+-------- -(0 rows) - -truncate copytest2; ---- same test but with an escape char different from quote char -copy copytest to :'filename' csv quote '''' escape E'\\'; -copy copytest2 from :'filename' csv quote '''' escape E'\\'; -select * from copytest except select * from copytest2; - style | test | filler --------+------+-------- -(0 rows) - --- test header line feature -create temp table copytest3 ( - c1 int, - "col with , comma" text, - "col with "" quote" int); -copy copytest3 from stdin csv header; -copy copytest3 to stdout csv header; -c1,"col with , comma","col with "" quote" -1,a,1 -2,b,2 -create temp table copytest4 ( - c1 int, - "colname with tab: " text); -copy copytest4 from stdin (header); -copy copytest4 to stdout (header); -c1 colname with tab: \t -1 a -2 b --- test copy from with a partitioned table -create table parted_copytest ( - a int, - b int, - c text -) partition by list (b); -create table parted_copytest_a1 (c text, b int, a int); -create table parted_copytest_a2 (a int, c text, b int); -alter table parted_copytest attach partition parted_copytest_a1 for values in(1); -alter table parted_copytest attach partition parted_copytest_a2 for values in(2); --- We must insert enough rows to trigger multi-inserts. 
These are only --- enabled adaptively when there are few enough partition changes. -insert into parted_copytest select x,1,'One' from generate_series(1,1000) x; -insert into parted_copytest select x,2,'Two' from generate_series(1001,1010) x; -insert into parted_copytest select x,1,'One' from generate_series(1011,1020) x; -\set filename :abs_builddir '/results/parted_copytest.csv' -copy (select * from parted_copytest order by a) to :'filename'; -truncate parted_copytest; -copy parted_copytest from :'filename'; --- Ensure COPY FREEZE errors for partitioned tables. -begin; -truncate parted_copytest; -copy parted_copytest from :'filename' (freeze); -ERROR: cannot perform COPY FREEZE on a partitioned table -rollback; -select tableoid::regclass,count(*),sum(a) from parted_copytest -group by tableoid order by tableoid::regclass::name; - tableoid | count | sum ---------------------+-------+-------- - parted_copytest_a1 | 1010 | 510655 - parted_copytest_a2 | 10 | 10055 -(2 rows) - -truncate parted_copytest; --- create before insert row trigger on parted_copytest_a2 -create function part_ins_func() returns trigger language plpgsql as $$ -begin - return new; -end; -$$; -create trigger part_ins_trig - before insert on parted_copytest_a2 - for each row - execute procedure part_ins_func(); -copy parted_copytest from :'filename'; -select tableoid::regclass,count(*),sum(a) from parted_copytest -group by tableoid order by tableoid::regclass::name; - tableoid | count | sum ---------------------+-------+-------- - parted_copytest_a1 | 1010 | 510655 - parted_copytest_a2 | 10 | 10055 -(2 rows) - -truncate table parted_copytest; -create index on parted_copytest (b); -drop trigger part_ins_trig on parted_copytest_a2; -copy parted_copytest from stdin; --- Ensure index entries were properly added during the copy. 
-select * from parted_copytest where b = 1; - a | b | c ----+---+------ - 1 | 1 | str1 -(1 row) - -select * from parted_copytest where b = 2; - a | b | c ----+---+------ - 2 | 2 | str2 -(1 row) - -drop table parted_copytest; --- --- Progress reporting for COPY --- -create table tab_progress_reporting ( - name text, - age int4, - location point, - salary int4, - manager name -); --- Add a trigger to catch and print the contents of the catalog view --- pg_stat_progress_copy during data insertion. This allows to test --- the validation of some progress reports for COPY FROM where the trigger --- would fire. -create function notice_after_tab_progress_reporting() returns trigger AS -$$ -declare report record; -begin - -- The fields ignored here are the ones that may not remain - -- consistent across multiple runs. The sizes reported may differ - -- across platforms, so just check if these are strictly positive. - with progress_data as ( - select - relid::regclass::text as relname, - command, - type, - bytes_processed > 0 as has_bytes_processed, - bytes_total > 0 as has_bytes_total, - tuples_processed, - tuples_excluded - from pg_stat_progress_copy - where pid = pg_backend_pid()) - select into report (to_jsonb(r)) as value - from progress_data r; - - raise info 'progress: %', report.value::text; - return new; -end; -$$ language plpgsql; -create trigger check_after_tab_progress_reporting - after insert on tab_progress_reporting - for each statement - execute function notice_after_tab_progress_reporting(); --- Generate COPY FROM report with PIPE. -copy tab_progress_reporting from stdin; -INFO: progress: {"type": "PIPE", "command": "COPY FROM", "relname": "tab_progress_reporting", "has_bytes_total": false, "tuples_excluded": 0, "tuples_processed": 3, "has_bytes_processed": true} --- Generate COPY FROM report with FILE, with some excluded tuples. 
-truncate tab_progress_reporting; -\set filename :abs_srcdir '/data/emp.data' -copy tab_progress_reporting from :'filename' - where (salary < 2000); -INFO: progress: {"type": "FILE", "command": "COPY FROM", "relname": "tab_progress_reporting", "has_bytes_total": true, "tuples_excluded": 1, "tuples_processed": 2, "has_bytes_processed": true} -drop trigger check_after_tab_progress_reporting on tab_progress_reporting; -drop function notice_after_tab_progress_reporting(); -drop table tab_progress_reporting; --- Test header matching feature -create table header_copytest ( - a int, - b int, - c text -); --- Make sure it works with dropped columns -alter table header_copytest drop column c; -alter table header_copytest add column c text; -copy header_copytest to stdout with (header match); -ERROR: cannot use "match" with HEADER in COPY TO -copy header_copytest from stdin with (header wrong_choice); -ERROR: header requires a Boolean value or "match" --- works -copy header_copytest from stdin with (header match); -copy header_copytest (c, a, b) from stdin with (header match); -copy header_copytest from stdin with (header match, format csv); --- errors -copy header_copytest (c, b, a) from stdin with (header match); -ERROR: column name mismatch in header line field 1: got "a", expected "c" -CONTEXT: COPY header_copytest, line 1: "a b c" -copy header_copytest from stdin with (header match); -ERROR: column name mismatch in header line field 3: got null value ("\N"), expected "c" -CONTEXT: COPY header_copytest, line 1: "a b \N" -copy header_copytest from stdin with (header match); -ERROR: wrong number of fields in header line: got 2, expected 3 -CONTEXT: COPY header_copytest, line 1: "a b" -copy header_copytest from stdin with (header match); -ERROR: wrong number of fields in header line: got 4, expected 3 -CONTEXT: COPY header_copytest, line 1: "a b c d" -copy header_copytest from stdin with (header match); -ERROR: column name mismatch in header line field 3: got "d", expected 
"c" -CONTEXT: COPY header_copytest, line 1: "a b d" -SELECT * FROM header_copytest ORDER BY a; - a | b | c ----+---+----- - 1 | 2 | foo - 3 | 4 | bar - 5 | 6 | baz -(3 rows) - --- Drop an extra column, in the middle of the existing set. -alter table header_copytest drop column b; --- works -copy header_copytest (c, a) from stdin with (header match); -copy header_copytest (a, c) from stdin with (header match); --- errors -copy header_copytest from stdin with (header match); -ERROR: wrong number of fields in header line: got 3, expected 2 -CONTEXT: COPY header_copytest, line 1: "a ........pg.dropped.2........ c" -copy header_copytest (a, c) from stdin with (header match); -ERROR: wrong number of fields in header line: got 3, expected 2 -CONTEXT: COPY header_copytest, line 1: "a c b" -SELECT * FROM header_copytest ORDER BY a; - a | c ----+----- - 1 | foo - 3 | bar - 5 | baz - 7 | foo - 8 | foo -(5 rows) - -drop table header_copytest; +-- vector +CREATE TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE TABLE t2 (val vector(3)); +\copy t TO 'results/vector.bin' WITH (FORMAT binary) +\copy t2 FROM 'results/vector.bin' WITH (FORMAT binary) +SELECT * FROM t2 ORDER BY val; + val +--------- + [0,0,0] + [1,1,1] + [1,2,3] + +(4 rows) + +DROP TABLE t; +DROP TABLE t2; +-- halfvec +CREATE TABLE t (val halfvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE TABLE t2 (val halfvec(3)); +\copy t TO 'results/halfvec.bin' WITH (FORMAT binary) +\copy t2 FROM 'results/halfvec.bin' WITH (FORMAT binary) +SELECT * FROM t2 ORDER BY val; + val +--------- + [0,0,0] + [1,1,1] + [1,2,3] + +(4 rows) + +DROP TABLE t; +DROP TABLE t2; +-- sparsevec +CREATE TABLE t (val sparsevec(3)); +INSERT INTO t (val) VALUES ('{}/3'), ('{1:1,2:2,3:3}/3'), ('{1:1,2:1,3:1}/3'), (NULL); +CREATE TABLE t2 (val sparsevec(3)); +\copy t TO 'results/sparsevec.bin' WITH (FORMAT binary) +\copy t2 FROM 'results/sparsevec.bin' 
WITH (FORMAT binary) +SELECT * FROM t2 ORDER BY val; + val +----------------- + {}/3 + {1:1,2:1,3:1}/3 + {1:1,2:2,3:3}/3 + +(4 rows) + +DROP TABLE t; +DROP TABLE t2; diff --git a/src/postgres/third-party-extensions/pgvector/CHANGELOG.md b/src/postgres/third-party-extensions/pgvector/CHANGELOG.md index 794416670440..757f998d0053 100644 --- a/src/postgres/third-party-extensions/pgvector/CHANGELOG.md +++ b/src/postgres/third-party-extensions/pgvector/CHANGELOG.md @@ -1,3 +1,91 @@ +## 0.8.0 (2024-10-30) + +- Added support for iterative index scans +- Added casts for arrays to `sparsevec` +- Improved cost estimation for better index selection when filtering +- Improved performance of HNSW index scans +- Improved performance of HNSW inserts and on-disk index builds +- Dropped support for Postgres 12 + +## 0.7.4 (2024-08-05) + +- Fixed locking for parallel HNSW index builds +- Fixed compilation error with GCC 14 on i386 when SSE2 is not enabled + +## 0.7.3 (2024-07-22) + +- Fixed `failed to add index item` error with `sparsevec` +- Fixed compilation error with FreeBSD ARM +- Fixed compilation warning with MSVC and Postgres 16 + +## 0.7.2 (2024-06-11) + +- Fixed initialization fork for indexes on unlogged tables + +## 0.7.1 (2024-06-03) + +- Improved performance of on-disk HNSW index builds +- Fixed `undefined symbol` error with GCC 8 +- Fixed compilation error with universal binaries on Mac +- Fixed compilation warning with Clang < 14 + +## 0.7.0 (2024-04-29) + +- Added `halfvec` type +- Added `sparsevec` type +- Added support for indexing `bit` type +- Added support for indexing L1 distance with HNSW +- Added `binary_quantize` function +- Added `hamming_distance` function +- Added `jaccard_distance` function +- Added `l2_normalize` function +- Added `subvector` function +- Added concatenate operator for vectors +- Added CPU dispatching for distance functions on Linux x86-64 +- Updated comparison operators to support vectors with different dimensions + +## 0.6.2 
(2024-03-18) + +- Reduced lock contention with parallel HNSW index builds + +## 0.6.1 (2024-03-04) + +- Fixed error with `ANALYZE` and vectors with different dimensions +- Fixed segmentation fault with `shared_preload_libraries` +- Fixed vector subtraction being marked as commutative + +## 0.6.0 (2024-01-29) + +If upgrading with Postgres 12 or Docker, see [these notes](https://github.com/pgvector/pgvector#060). + +- Added support for parallel index builds for HNSW +- Added validation for GUC parameters +- Changed storage for vector from `extended` to `external` +- Improved performance of HNSW +- Reduced memory usage for HNSW index builds +- Reduced WAL generation for HNSW index builds +- Fixed error with logical replication +- Fixed `invalid memory alloc request size` error with HNSW index builds +- Moved Docker image to `pgvector` org +- Added Docker tags for each supported version of Postgres +- Dropped support for Postgres 11 + +## 0.5.1 (2023-10-10) + +- Improved performance of HNSW index builds +- Added check for MVCC-compliant snapshot for index scans + +## 0.5.0 (2023-08-28) + +- Added HNSW index type +- Added support for parallel index builds for IVFFlat +- Added `l1_distance` function +- Added element-wise multiplication for vectors +- Added `sum` aggregate +- Improved performance of distance functions +- Fixed out of range results for cosine distance +- Fixed results for NULL and NaN distances for IVFFlat + ## 0.4.4 (2023-06-12) - Improved error message for malformed vector literal @@ -28,7 +116,7 @@ ## 0.4.0 (2023-01-11) -If upgrading with Postgres < 13, see [this note](https://github.com/pgvector/pgvector#040). +If upgrading with Postgres < 13, see [this note](https://github.com/pgvector/pgvector/blob/v0.4.0/README.md#040). 
- Changed text representation for vector elements to match `real` - Changed storage for vector from `plain` to `extended` @@ -45,7 +133,7 @@ If upgrading with Postgres < 13, see [this note](https://github.com/pgvector/pgv ## 0.3.1 (2022-11-02) -If upgrading from 0.2.7 or 0.3.0, [recreate](https://github.com/pgvector/pgvector#031) all `ivfflat` indexes after upgrading to ensure all data is indexed. +If upgrading from 0.2.7 or 0.3.0, [recreate](https://github.com/pgvector/pgvector/blob/v0.3.1/README.md#031) all `ivfflat` indexes after upgrading to ensure all data is indexed. - Fixed issue with inserts silently corrupting `ivfflat` indexes (introduced in 0.2.7) - Fixed segmentation fault with index creation when lists > 6500 diff --git a/src/postgres/third-party-extensions/pgvector/Dockerfile b/src/postgres/third-party-extensions/pgvector/Dockerfile index 6fe309954fa4..936440928190 100644 --- a/src/postgres/third-party-extensions/pgvector/Dockerfile +++ b/src/postgres/third-party-extensions/pgvector/Dockerfile @@ -1,10 +1,11 @@ -ARG PG_MAJOR=15 +ARG PG_MAJOR=17 FROM postgres:$PG_MAJOR ARG PG_MAJOR COPY . 
/tmp/pgvector RUN apt-get update && \ + apt-mark hold locales && \ apt-get install -y --no-install-recommends build-essential postgresql-server-dev-$PG_MAJOR && \ cd /tmp/pgvector && \ make clean && \ @@ -15,4 +16,5 @@ RUN apt-get update && \ rm -r /tmp/pgvector && \ apt-get remove -y build-essential postgresql-server-dev-$PG_MAJOR && \ apt-get autoremove -y && \ + apt-mark unhold locales && \ rm -rf /var/lib/apt/lists/* diff --git a/src/postgres/third-party-extensions/pgvector/LICENSE b/src/postgres/third-party-extensions/pgvector/LICENSE index 483e2b9ae90f..066b96ab018f 100644 --- a/src/postgres/third-party-extensions/pgvector/LICENSE +++ b/src/postgres/third-party-extensions/pgvector/LICENSE @@ -1,4 +1,4 @@ -Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group +Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group Portions Copyright (c) 1994, The Regents of the University of California diff --git a/src/postgres/third-party-extensions/pgvector/META.json b/src/postgres/third-party-extensions/pgvector/META.json index a71d81086cd7..b9a68f62a279 100644 --- a/src/postgres/third-party-extensions/pgvector/META.json +++ b/src/postgres/third-party-extensions/pgvector/META.json @@ -2,7 +2,7 @@ "name": "vector", "abstract": "Open-source vector similarity search for Postgres", "description": "Supports L2 distance, inner product, and cosine distance", - "version": "0.4.4", + "version": "0.8.0", "maintainer": [ "Andrew Kane " ], @@ -12,7 +12,7 @@ "prereqs": { "runtime": { "requires": { - "PostgreSQL": "11.0.0" + "PostgreSQL": "13.0.0" } } }, @@ -20,7 +20,7 @@ "vector": { "file": "sql/vector.sql", "docfile": "README.md", - "version": "0.4.4", + "version": "0.8.0", "abstract": "Open-source vector similarity search for Postgres" } }, diff --git a/src/postgres/third-party-extensions/pgvector/Makefile b/src/postgres/third-party-extensions/pgvector/Makefile index 1ca5ed76081d..c2ff6a0afcc1 100644 --- 
a/src/postgres/third-party-extensions/pgvector/Makefile +++ b/src/postgres/third-party-extensions/pgvector/Makefile @@ -1,17 +1,20 @@ EXTENSION = vector -EXTVERSION = 0.4.4-yb-1.2 +EXTVERSION = 0.8.0-yb-1.0 MODULE_big = vector -DATA = $(wildcard sql/*--*.sql) -OBJS = src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/vector.o src/ybvector/ybvector.o src/ybvector/ybvectorwrite.o src/ybvector/ybvectorread.o src/ybvector/ybvectorutil.o src/ybvector/ybdummyann.o src/ybvector/ybhnsw.o +DATA = $(wildcard sql/*--*--*.sql) +DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql +OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/sparsevec.o src/vector.o src/ybvector/ybvector.o src/ybvector/ybvectorwrite.o src/ybvector/ybvectorread.o src/ybvector/ybvectorutil.o src/ybvector/ybdummyann.o src/ybvector/ybhnsw.o +HEADERS = src/halfvec.h src/sparsevec.h src/vector.h src/ybvector/ybvector.h TESTS = $(wildcard test/sql/*.sql) REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) -REGRESS_OPTS = --inputdir=test --load-extension=vector +REGRESS_OPTS = --inputdir=test --load-extension=$(EXTENSION) +# To compile for portability, run: make OPTFLAGS="" OPTFLAGS = -march=native -# Mac ARM doesn't support -march=native +# Mac ARM doesn't always support -march=native ifeq ($(shell uname -s), Darwin) ifeq ($(shell uname -p), arm) # no difference with -march=armv8.5-a @@ -40,8 +43,6 @@ all: sql/$(EXTENSION)--$(EXTVERSION).sql sql/$(EXTENSION)--$(EXTVERSION).sql: sql/$(EXTENSION).sql cp $< $@ -EXTRA_CLEAN = sql/$(EXTENSION)--$(EXTVERSION).sql - PG_CONFIG ?= pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) @@ -55,7 +56,7 @@ LDFLAGS += -lyb_pggate LDFLAGS += -lyb_pggate_util LDFLAGS += -L${BUILD_ROOT}/lib -# for Postgres 15 +# for Postgres < 15 PROVE_FLAGS += -I ./test/perl prove_installcheck: @@ -68,13 +69,15 @@ dist: mkdir -p dist git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output 
dist/$(EXTENSION)-$(EXTVERSION).zip master +# for Docker +PG_MAJOR ?= 17 + .PHONY: docker docker: - docker build --pull --no-cache --platform linux/amd64 -t ankane/pgvector:latest . + docker build --pull --no-cache --build-arg PG_MAJOR=$(PG_MAJOR) -t pgvector/pgvector:pg$(PG_MAJOR) -t pgvector/pgvector:$(EXTVERSION)-pg$(PG_MAJOR) . .PHONY: docker-release docker-release: - docker buildx build --push --pull --no-cache --platform linux/amd64,linux/arm64 -t ankane/pgvector:latest . - docker buildx build --push --platform linux/amd64,linux/arm64 -t ankane/pgvector:v$(EXTVERSION) . + docker buildx build --push --pull --no-cache --platform linux/amd64,linux/arm64 --build-arg PG_MAJOR=$(PG_MAJOR) -t pgvector/pgvector:pg$(PG_MAJOR) -t pgvector/pgvector:$(EXTVERSION)-pg$(PG_MAJOR) . diff --git a/src/postgres/third-party-extensions/pgvector/Makefile.win b/src/postgres/third-party-extensions/pgvector/Makefile.win index 6f111eff65d3..ea5df24b3cd5 100644 --- a/src/postgres/third-party-extensions/pgvector/Makefile.win +++ b/src/postgres/third-party-extensions/pgvector/Makefile.win @@ -1,10 +1,13 @@ +# YB Note: Not used in YB. 
EXTENSION = vector -EXTVERSION = 0.4.4-yb-1.2 +EXTVERSION = 0.8.0-yb-1.0 -OBJS = src\ivfbuild.obj src\ivfflat.obj src\ivfinsert.obj src\ivfkmeans.obj src\ivfscan.obj src\ivfutils.obj src\ivfvacuum.obj src\vector.obj +DATA_built = sql\$(EXTENSION)--$(EXTVERSION).sql +OBJS = src\bitutils.obj src\bitvec.obj src\halfutils.obj src\halfvec.obj src\hnsw.obj src\hnswbuild.obj src\hnswinsert.obj src\hnswscan.obj src\hnswutils.obj src\hnswvacuum.obj src\ivfbuild.obj src\ivfflat.obj src\ivfinsert.obj src\ivfkmeans.obj src\ivfscan.obj src\ivfutils.obj src\ivfvacuum.obj src\sparsevec.obj src\vector.obj +HEADERS = src\halfvec.h src\sparsevec.h src\vector.h -REGRESS = btree cast copy functions input ivfflat_cosine ivfflat_ip ivfflat_l2 ivfflat_options ivfflat_unlogged -REGRESS_OPTS = --inputdir=test --load-extension=vector +REGRESS = bit btree cast copy halfvec hnsw_bit hnsw_halfvec hnsw_sparsevec hnsw_vector ivfflat_bit ivfflat_halfvec ivfflat_vector sparsevec vector_type +REGRESS_OPTS = --inputdir=test --load-extension=$(EXTENSION) # For /arch flags # https://learn.microsoft.com/en-us/cpp/build/reference/arch-minimum-cpu-architecture @@ -18,11 +21,6 @@ PG_CFLAGS = $(PG_CFLAGS) $(OPTFLAGS) /O2 /fp:fast # https://learn.microsoft.com/en-us/cpp/error-messages/tool-errors/vectorizer-and-parallelizer-messages # PG_CFLAGS = $(PG_CFLAGS) /Qvec-report:2 -all: sql\$(EXTENSION)--$(EXTVERSION).sql - -sql\$(EXTENSION)--$(EXTVERSION).sql: sql\$(EXTENSION).sql - copy sql\$(EXTENSION).sql $@ - # TODO use pg_config !ifndef PGROOT !error PGROOT is not set @@ -42,18 +40,23 @@ SHLIB = $(EXTENSION).dll LIBS = "$(LIBDIR)\postgres.lib" +all: $(SHLIB) $(DATA_built) + .c.obj: $(CC) $(CFLAGS) /c $< /Fo$@ $(SHLIB): $(OBJS) $(CC) $(CFLAGS) $(OBJS) $(LIBS) /link /DLL /OUT:$(SHLIB) -all: $(SHLIB) +sql\$(EXTENSION)--$(EXTVERSION).sql: sql\$(EXTENSION).sql + copy sql\$(EXTENSION).sql $@ -install: +install: all copy $(SHLIB) "$(PKGLIBDIR)" copy $(EXTENSION).control "$(SHAREDIR)\extension" copy 
sql\$(EXTENSION)--*.sql "$(SHAREDIR)\extension" + mkdir "$(INCLUDEDIR_SERVER)\extension\$(EXTENSION)" + for %f in ($(HEADERS)) do copy %f "$(INCLUDEDIR_SERVER)\extension\$(EXTENSION)" installcheck: "$(BINDIR)\pg_regress" --bindir="$(BINDIR)" $(REGRESS_OPTS) $(REGRESS) @@ -61,10 +64,12 @@ installcheck: uninstall: del /f "$(PKGLIBDIR)\$(SHLIB)" del /f "$(SHAREDIR)\extension\$(EXTENSION).control" - del /f "$(SHAREDIR)\extension\vector--*.sql" + del /f "$(SHAREDIR)\extension\$(EXTENSION)--*.sql" + del /f "$(INCLUDEDIR_SERVER)\extension\$(EXTENSION)\*.h" + rmdir "$(INCLUDEDIR_SERVER)\extension\$(EXTENSION)" clean: del /f $(SHLIB) $(EXTENSION).lib $(EXTENSION).exp + del /f $(DATA_built) del /f $(OBJS) - del /f sql\$(EXTENSION)--$(EXTVERSION).sql del /f /s /q results regression.diffs regression.out tmp_check tmp_check_iso log output_iso diff --git a/src/postgres/third-party-extensions/pgvector/README.md b/src/postgres/third-party-extensions/pgvector/README.md index be02c961efeb..0162b30cdc6f 100644 --- a/src/postgres/third-party-extensions/pgvector/README.md +++ b/src/postgres/third-party-extensions/pgvector/README.md @@ -2,31 +2,61 @@ Open-source vector similarity search for Postgres -Supports +Store your vectors with the rest of your data. 
Supports: - exact and approximate nearest neighbor search -- L2 distance, inner product, and cosine distance +- single-precision, half-precision, binary, and sparse vectors +- L2 distance, inner product, cosine distance, L1 distance, Hamming distance, and Jaccard distance - any [language](#languages) with a Postgres client Plus [ACID](https://en.wikipedia.org/wiki/ACID) compliance, point-in-time recovery, JOINs, and all of the other [great features](https://www.postgresql.org/about/) of Postgres -[![Build Status](https://github.com/pgvector/pgvector/workflows/build/badge.svg?branch=master)](https://github.com/pgvector/pgvector/actions) +[![Build Status](https://github.com/pgvector/pgvector/actions/workflows/build.yml/badge.svg)](https://github.com/pgvector/pgvector/actions) ## Installation -Compile and install the extension (supports Postgres 11+) +### Linux and Mac + +Compile and install the extension (supports Postgres 13+) ```sh cd /tmp -git clone --branch v0.4.4 https://github.com/pgvector/pgvector.git +git clone --branch v0.8.0 https://github.com/pgvector/pgvector.git cd pgvector make make install # may need sudo ``` -See the [installation notes](#installation-notes) if you run into issues +See the [installation notes](#installation-notes---linux-and-mac) if you run into issues + +You can also install it with [Docker](#docker), [Homebrew](#homebrew), [PGXN](#pgxn), [APT](#apt), [Yum](#yum), [pkg](#pkg), or [conda-forge](#conda-forge), and it comes preinstalled with [Postgres.app](#postgresapp) and many [hosted providers](#hosted-postgres). There are also instructions for [GitHub Actions](https://github.com/pgvector/setup-pgvector). 
+ +### Windows + +Ensure [C++ support in Visual Studio](https://learn.microsoft.com/en-us/cpp/build/building-on-the-command-line?view=msvc-170#download-and-install-the-tools) is installed, and run: + +```cmd +call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" +``` -You can also install it with [Docker](#docker), [Homebrew](#homebrew), [PGXN](#pgxn), [APT](#apt), [Yum](#yum), or [conda-forge](#conda-forge), and it comes preinstalled with [Postgres.app](#postgresapp) and many [hosted providers](#hosted-postgres) +Note: The exact path will vary depending on your Visual Studio version and edition + +Then use `nmake` to build: + +```cmd +set "PGROOT=C:\Program Files\PostgreSQL\16" +cd %TEMP% +git clone --branch v0.8.0 https://github.com/pgvector/pgvector.git +cd pgvector +nmake /F Makefile.win +nmake /F Makefile.win install +``` + +Note: Postgres 17 is not supported yet due to an upstream issue + +See the [installation notes](#installation-notes---windows) if you run into issues + +You can also install it with [Docker](#docker) or [conda-forge](#conda-forge). 
## Getting Started @@ -54,7 +84,7 @@ Get the nearest neighbors by L2 distance SELECT * FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5; ``` -Also supports inner product (`<#>`) and cosine distance (`<=>`) +Also supports inner product (`<#>`), cosine distance (`<=>`), and L1 distance (`<+>`, added in 0.7.0) Note: `<#>` returns the negative inner product since Postgres only supports `ASC` order index scans on operators @@ -72,12 +102,20 @@ Or add a vector column to an existing table ALTER TABLE items ADD COLUMN embedding vector(3); ``` +Also supports [half-precision](#half-precision-vectors), [binary](#binary-vectors), and [sparse](#sparse-vectors) vectors + Insert vectors ```sql INSERT INTO items (embedding) VALUES ('[1,2,3]'), ('[4,5,6]'); ``` +Or load vectors in bulk using `COPY` ([example](https://github.com/pgvector/pgvector-python/blob/master/examples/loading/example.py)) + +```sql +COPY items (embedding) FROM STDIN WITH (FORMAT BINARY); +``` + Upsert vectors ```sql @@ -105,6 +143,15 @@ Get the nearest neighbors to a vector SELECT * FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5; ``` +Supported distance functions are: + +- `<->` - L2 distance +- `<#>` - (negative) inner product +- `<=>` - cosine distance +- `<+>` - L1 distance (added in 0.7.0) +- `<~>` - Hamming distance (binary vectors, added in 0.7.0) +- `<%>` - Jaccard distance (binary vectors, added in 0.7.0) + Get the nearest neighbors to a row ```sql @@ -157,7 +204,140 @@ SELECT category_id, AVG(embedding) FROM items GROUP BY category_id; By default, pgvector performs exact nearest neighbor search, which provides perfect recall. -You can add an index to use approximate nearest neighbor search, which trades some recall for performance. Unlike typical indexes, you will see different results for queries after adding an approximate index. +You can add an index to use approximate nearest neighbor search, which trades some recall for speed. 
Unlike typical indexes, you will see different results for queries after adding an approximate index. + +Supported index types are: + +- [HNSW](#hnsw) +- [IVFFlat](#ivfflat) + +## HNSW + +An HNSW index creates a multilayer graph. It has better query performance than IVFFlat (in terms of speed-recall tradeoff), but has slower build times and uses more memory. Also, an index can be created without any data in the table since there isn’t a training step like IVFFlat. + +Add an index for each distance function you want to use. + +L2 distance + +```sql +CREATE INDEX ON items USING hnsw (embedding vector_l2_ops); +``` + +Note: Use `halfvec_l2_ops` for `halfvec` and `sparsevec_l2_ops` for `sparsevec` (and similar with the other distance functions) + +Inner product + +```sql +CREATE INDEX ON items USING hnsw (embedding vector_ip_ops); +``` + +Cosine distance + +```sql +CREATE INDEX ON items USING hnsw (embedding vector_cosine_ops); +``` + +L1 distance - added in 0.7.0 + +```sql +CREATE INDEX ON items USING hnsw (embedding vector_l1_ops); +``` + +Hamming distance - added in 0.7.0 + +```sql +CREATE INDEX ON items USING hnsw (embedding bit_hamming_ops); +``` + +Jaccard distance - added in 0.7.0 + +```sql +CREATE INDEX ON items USING hnsw (embedding bit_jaccard_ops); +``` + +Supported types are: + +- `vector` - up to 2,000 dimensions +- `halfvec` - up to 4,000 dimensions (added in 0.7.0) +- `bit` - up to 64,000 dimensions (added in 0.7.0) +- `sparsevec` - up to 1,000 non-zero elements (added in 0.7.0) + +### Index Options + +Specify HNSW parameters + +- `m` - the max number of connections per layer (16 by default) +- `ef_construction` - the size of the dynamic candidate list for constructing the graph (64 by default) + +```sql +CREATE INDEX ON items USING hnsw (embedding vector_l2_ops) WITH (m = 16, ef_construction = 64); +``` + +A higher value of `ef_construction` provides better recall at the cost of index build time / insert speed. 
+ +### Query Options + +Specify the size of the dynamic candidate list for search (40 by default) + +```sql +SET hnsw.ef_search = 100; +``` + +A higher value provides better recall at the cost of speed. + +Use `SET LOCAL` inside a transaction to set it for a single query + +```sql +BEGIN; +SET LOCAL hnsw.ef_search = 100; +SELECT ... +COMMIT; +``` + +### Index Build Time + +Indexes build significantly faster when the graph fits into `maintenance_work_mem` + +```sql +SET maintenance_work_mem = '8GB'; +``` + +A notice is shown when the graph no longer fits + +```text +NOTICE: hnsw graph no longer fits into maintenance_work_mem after 100000 tuples +DETAIL: Building will take significantly more time. +HINT: Increase maintenance_work_mem to speed up builds. +``` + +Note: Do not set `maintenance_work_mem` so high that it exhausts the memory on the server + +Like other index types, it’s faster to create an index after loading your initial data + +Starting with 0.6.0, you can also speed up index creation by increasing the number of parallel workers (2 by default) + +```sql +SET max_parallel_maintenance_workers = 7; -- plus leader +``` + +For a large number of workers, you may also need to increase `max_parallel_workers` (8 by default) + +### Indexing Progress + +Check [indexing progress](https://www.postgresql.org/docs/current/progress-reporting.html#CREATE-INDEX-PROGRESS-REPORTING) + +```sql +SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS "%" FROM pg_stat_progress_create_index; +``` + +The phases for HNSW are: + +1. `initializing` +2. `loading tuples` + +## IVFFlat + +An IVFFlat index divides vectors into lists, and then searches a subset of those lists that are closest to the query vector. It has faster build times and uses less memory than HNSW, but has lower query performance (in terms of speed-recall tradeoff). 
Three keys to achieving good recall are: @@ -173,6 +353,8 @@ L2 distance CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100); ``` +Note: Use `halfvec_l2_ops` for `halfvec` (and similar with the other distance functions) + Inner product ```sql @@ -185,7 +367,17 @@ Cosine distance CREATE INDEX ON items USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100); ``` -Vectors with up to 2,000 dimensions can be indexed. +Hamming distance - added in 0.7.0 + +```sql +CREATE INDEX ON items USING ivfflat (embedding bit_hamming_ops) WITH (lists = 100); +``` + +Supported types are: + +- `vector` - up to 2,000 dimensions +- `halfvec` - up to 4,000 dimensions (added in 0.7.0) +- `bit` - up to 64,000 dimensions (added in 0.7.0) ### Query Options @@ -206,68 +398,334 @@ SELECT ... COMMIT; ``` +### Index Build Time + +Speed up index creation on large tables by increasing the number of parallel workers (2 by default) + +```sql +SET max_parallel_maintenance_workers = 7; -- plus leader +``` + +For a large number of workers, you may also need to increase `max_parallel_workers` (8 by default) + ### Indexing Progress -Check [indexing progress](https://www.postgresql.org/docs/current/progress-reporting.html#CREATE-INDEX-PROGRESS-REPORTING) with Postgres 12+ +Check [indexing progress](https://www.postgresql.org/docs/current/progress-reporting.html#CREATE-INDEX-PROGRESS-REPORTING) ```sql -SELECT phase, tuples_done, tuples_total FROM pg_stat_progress_create_index; +SELECT phase, round(100.0 * tuples_done / nullif(tuples_total, 0), 1) AS "%" FROM pg_stat_progress_create_index; ``` -The phases are: +The phases for IVFFlat are: 1. `initializing` 2. `performing k-means` -3. `sorting tuples` +3. `assigning tuples` 4. 
`loading tuples` -Note: `tuples_done` and `tuples_total` are only populated during the `loading tuples` phase +Note: `%` is only populated during the `loading tuples` phase -### Filtering +## Filtering -There are a few ways to index nearest neighbor queries with a `WHERE` clause +There are a few ways to index nearest neighbor queries with a `WHERE` clause. ```sql SELECT * FROM items WHERE category_id = 123 ORDER BY embedding <-> '[3,1,2]' LIMIT 5; ``` -Create an index on one [or more](https://www.postgresql.org/docs/current/indexes-multicolumn.html) of the `WHERE` columns for exact search +A good place to start is creating an index on the filter column. This can provide fast, exact nearest neighbor search in many cases. Postgres has a number of [index types](https://www.postgresql.org/docs/current/indexes-types.html) for this: B-tree (default), hash, GiST, SP-GiST, GIN, and BRIN. ```sql CREATE INDEX ON items (category_id); ``` -Or a [partial index](https://www.postgresql.org/docs/current/indexes-partial.html) on the vector column for approximate search +For multiple columns, consider a [multicolumn index](https://www.postgresql.org/docs/current/indexes-multicolumn.html). + +```sql +CREATE INDEX ON items (location_id, category_id); +``` + +Exact indexes work well for conditions that match a low percentage of rows. Otherwise, [approximate indexes](#indexing) can work better. + +```sql +CREATE INDEX ON items USING hnsw (embedding vector_l2_ops); +``` + +With approximate indexes, filtering is applied *after* the index is scanned. If a condition matches 10% of rows, with HNSW and the default `hnsw.ef_search` of 40, only 4 rows will match on average. For more rows, increase `hnsw.ef_search`. 
```sql -CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100) - WHERE (category_id = 123); +SET hnsw.ef_search = 200; ``` -Use [partitioning](https://www.postgresql.org/docs/current/ddl-partitioning.html) for approximate search on many different values of the `WHERE` columns +Starting with 0.8.0, you can enable [iterative index scans](#iterative-index-scans), which will automatically scan more of the index when needed. + +```sql +SET hnsw.iterative_scan = strict_order; +``` + +If filtering by only a few distinct values, consider [partial indexing](https://www.postgresql.org/docs/current/indexes-partial.html). + +```sql +CREATE INDEX ON items USING hnsw (embedding vector_l2_ops) WHERE (category_id = 123); +``` + +If filtering by many different values, consider [partitioning](https://www.postgresql.org/docs/current/ddl-partitioning.html). ```sql CREATE TABLE items (embedding vector(3), category_id int) PARTITION BY LIST(category_id); ``` +## Iterative Index Scans + +*Added in 0.8.0* + +With approximate indexes, queries with filtering can return less results since filtering is applied *after* the index is scanned. Starting with 0.8.0, you can enable iterative index scans, which will automatically scan more of the index until enough results are found (or it reaches `hnsw.max_scan_tuples` or `ivfflat.max_probes`). + +Iterative scans can use strict or relaxed ordering. 
+ +Strict ensures results are in the exact order by distance + +```sql +SET hnsw.iterative_scan = strict_order; +``` + +Relaxed allows results to be slightly out of order by distance, but provides better recall + +```sql +SET hnsw.iterative_scan = relaxed_order; +-- or +SET ivfflat.iterative_scan = relaxed_order; +``` + +With relaxed ordering, you can use a [materialized CTE](https://www.postgresql.org/docs/current/queries-with.html#QUERIES-WITH-CTE-MATERIALIZATION) to get strict ordering + +```sql +WITH relaxed_results AS MATERIALIZED ( + SELECT id, embedding <-> '[1,2,3]' AS distance FROM items WHERE category_id = 123 ORDER BY distance LIMIT 5 +) SELECT * FROM relaxed_results ORDER BY distance; +``` + +For queries that filter by distance, use a materialized CTE and place the distance filter outside of it for best performance (due to the [current behavior](https://www.postgresql.org/message-id/flat/CAOdR5yGUoMQ6j7M5hNUXrySzaqZVGf_Ne%2B8fwZMRKTFxU1nbJg%40mail.gmail.com) of the Postgres executor) + +```sql +WITH nearest_results AS MATERIALIZED ( + SELECT id, embedding <-> '[1,2,3]' AS distance FROM items ORDER BY distance LIMIT 5 +) SELECT * FROM nearest_results WHERE distance < 5 ORDER BY distance; +``` + +Note: Place any other filters inside the CTE + +### Iterative Scan Options + +Since scanning a large portion of an approximate index is expensive, there are options to control when a scan ends. 
+ +#### HNSW + +Specify the max number of tuples to visit (20,000 by default) + +```sql +SET hnsw.max_scan_tuples = 20000; +``` + +Note: This is approximate and does not affect the initial scan + +Specify the max amount of memory to use, as a multiple of `work_mem` (1 by default) + +```sql +SET hnsw.scan_mem_multiplier = 2; +``` + +Note: Try increasing this if increasing `hnsw.max_scan_tuples` does not improve recall + +#### IVFFlat + +Specify the max number of probes + +```sql +SET ivfflat.max_probes = 100; +``` + +Note: If this is lower than `ivfflat.probes`, `ivfflat.probes` will be used + +## Half-Precision Vectors + +*Added in 0.7.0* + +Use the `halfvec` type to store half-precision vectors + +```sql +CREATE TABLE items (id bigserial PRIMARY KEY, embedding halfvec(3)); +``` + +## Half-Precision Indexing + +*Added in 0.7.0* + +Index vectors at half precision for smaller indexes + +```sql +CREATE INDEX ON items USING hnsw ((embedding::halfvec(3)) halfvec_l2_ops); +``` + +Get the nearest neighbors + +```sql +SELECT * FROM items ORDER BY embedding::halfvec(3) <-> '[1,2,3]' LIMIT 5; +``` + +## Binary Vectors + +Use the `bit` type to store binary vectors ([example](https://github.com/pgvector/pgvector-python/blob/master/examples/imagehash/example.py)) + +```sql +CREATE TABLE items (id bigserial PRIMARY KEY, embedding bit(3)); +INSERT INTO items (embedding) VALUES ('000'), ('111'); +``` + +Get the nearest neighbors by Hamming distance (added in 0.7.0) + +```sql +SELECT * FROM items ORDER BY embedding <~> '101' LIMIT 5; +``` + +Or (before 0.7.0) + +```sql +SELECT * FROM items ORDER BY bit_count(embedding # '101') LIMIT 5; +``` + +Also supports Jaccard distance (`<%>`) + +## Binary Quantization + +*Added in 0.7.0* + +Use expression indexing for binary quantization + +```sql +CREATE INDEX ON items USING hnsw ((binary_quantize(embedding)::bit(3)) bit_hamming_ops); +``` + +Get the nearest neighbors by Hamming distance + +```sql +SELECT * FROM items ORDER BY 
binary_quantize(embedding)::bit(3) <~> binary_quantize('[1,-2,3]') LIMIT 5; +``` + +Re-rank by the original vectors for better recall + +```sql +SELECT * FROM ( + SELECT * FROM items ORDER BY binary_quantize(embedding)::bit(3) <~> binary_quantize('[1,-2,3]') LIMIT 20 +) AS candidates ORDER BY embedding <=> '[1,-2,3]' LIMIT 5; +``` + +## Sparse Vectors + +*Added in 0.7.0* + +Use the `sparsevec` type to store sparse vectors + +```sql +CREATE TABLE items (id bigserial PRIMARY KEY, embedding sparsevec(5)); +``` + +Insert vectors + +```sql +INSERT INTO items (embedding) VALUES ('{1:1,3:2,5:3}/5'), ('{1:4,3:5,5:6}/5'); +``` + +The format is `{index1:value1,index2:value2}/dimensions` and indices start at 1 like SQL arrays + +Get the nearest neighbors by L2 distance + +```sql +SELECT * FROM items ORDER BY embedding <-> '{1:3,3:1,5:2}/5' LIMIT 5; +``` + ## Hybrid Search -Use together with Postgres [full-text search](https://www.postgresql.org/docs/current/textsearch-intro.html) for hybrid search ([Python example](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search.py)). +Use together with Postgres [full-text search](https://www.postgresql.org/docs/current/textsearch-intro.html) for hybrid search. ```sql -SELECT id, content FROM items, to_tsquery('hello & search') query +SELECT id, content FROM items, plainto_tsquery('hello search') query WHERE textsearch @@ query ORDER BY ts_rank_cd(textsearch, query) DESC LIMIT 5; ``` +You can use [Reciprocal Rank Fusion](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search/rrf.py) or a [cross-encoder](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search/cross_encoder.py) to combine results. 
+ +## Indexing Subvectors + +*Added in 0.7.0* + +Use expression indexing to index subvectors + +```sql +CREATE INDEX ON items USING hnsw ((subvector(embedding, 1, 3)::vector(3)) vector_cosine_ops); +``` + +Get the nearest neighbors by cosine distance + +```sql +SELECT * FROM items ORDER BY subvector(embedding, 1, 3)::vector(3) <=> subvector('[1,2,3,4,5]'::vector, 1, 3) LIMIT 5; +``` + +Re-rank by the full vectors for better recall + +```sql +SELECT * FROM ( + SELECT * FROM items ORDER BY subvector(embedding, 1, 3)::vector(3) <=> subvector('[1,2,3,4,5]'::vector, 1, 3) LIMIT 20 +) AS candidates ORDER BY embedding <=> '[1,2,3,4,5]' LIMIT 5; +``` + ## Performance +### Tuning + +Use a tool like [PgTune](https://pgtune.leopard.in.ua/) to set initial values for Postgres server parameters. For instance, `shared_buffers` should typically be 25% of the server’s memory. You can find the config file with: + +```sql +SHOW config_file; +``` + +And check individual settings with: + +```sql +SHOW shared_buffers; +``` + +Be sure to restart Postgres for changes to take effect. + +### Loading + +Use `COPY` for bulk loading data ([example](https://github.com/pgvector/pgvector-python/blob/master/examples/loading/example.py)). + +```sql +COPY items (embedding) FROM STDIN WITH (FORMAT BINARY); +``` + +Add any indexes *after* loading the initial data for best performance. + +### Indexing + +See index build time for [HNSW](#index-build-time) and [IVFFlat](#index-build-time-1). + +In production environments, create indexes concurrently to avoid blocking writes. + +```sql +CREATE INDEX CONCURRENTLY ... +``` + +### Querying + Use `EXPLAIN ANALYZE` to debug performance. ```sql EXPLAIN ANALYZE SELECT * FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5; ``` -### Exact Search +#### Exact Search To speed up queries without an index, increase `max_parallel_workers_per_gather`. 
@@ -281,30 +739,79 @@ If vectors are normalized to length 1 (like [OpenAI embeddings](https://platform SELECT * FROM items ORDER BY embedding <#> '[3,1,2]' LIMIT 5; ``` -### Approximate Search +#### Approximate Search -To speed up queries with an index, increase the number of inverted lists (at the expense of recall). +To speed up queries with an IVFFlat index, increase the number of inverted lists (at the expense of recall). ```sql CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 1000); ``` +### Vacuuming + +Vacuuming can take a while for HNSW indexes. Speed it up by reindexing first. + +```sql +REINDEX INDEX CONCURRENTLY index_name; +VACUUM table_name; +``` + +## Monitoring + +Monitor performance with [pg_stat_statements](https://www.postgresql.org/docs/current/pgstatstatements.html) (be sure to add it to `shared_preload_libraries`). + +```sql +CREATE EXTENSION pg_stat_statements; +``` + +Get the most time-consuming queries with: + +```sql +SELECT query, calls, ROUND((total_plan_time + total_exec_time) / calls) AS avg_time_ms, + ROUND((total_plan_time + total_exec_time) / 60000) AS total_time_min + FROM pg_stat_statements ORDER BY total_plan_time + total_exec_time DESC LIMIT 20; +``` + +Note: Replace `total_plan_time + total_exec_time` with `total_time` for Postgres < 13 + +Monitor recall by comparing results from approximate search with exact search. + +```sql +BEGIN; +SET LOCAL enable_indexscan = off; -- use exact search +SELECT ... +COMMIT; +``` + +## Scaling + +Scale pgvector the same way you scale Postgres. + +Scale vertically by increasing memory, CPU, and storage on a single instance. Use existing tools to [tune parameters](#tuning) and [monitor performance](#monitoring). 
+ +Scale horizontally with [replicas](https://www.postgresql.org/docs/current/hot-standby.html), or use [Citus](https://github.com/citusdata/citus) or another approach for sharding ([example](https://github.com/pgvector/pgvector-python/blob/master/examples/citus/example.py)). + ## Languages Use pgvector from any language with a Postgres client. You can even generate and store vectors in one language and query them in another. Language | Libraries / Examples --- | --- +C | [pgvector-c](https://github.com/pgvector/pgvector-c) C++ | [pgvector-cpp](https://github.com/pgvector/pgvector-cpp) -C# | [pgvector-dotnet](https://github.com/pgvector/pgvector-dotnet) +C#, F#, Visual Basic | [pgvector-dotnet](https://github.com/pgvector/pgvector-dotnet) Crystal | [pgvector-crystal](https://github.com/pgvector/pgvector-crystal) +Dart | [pgvector-dart](https://github.com/pgvector/pgvector-dart) Elixir | [pgvector-elixir](https://github.com/pgvector/pgvector-elixir) Go | [pgvector-go](https://github.com/pgvector/pgvector-go) Haskell | [pgvector-haskell](https://github.com/pgvector/pgvector-haskell) -Java, Scala | [pgvector-java](https://github.com/pgvector/pgvector-java) +Java, Kotlin, Groovy, Scala | [pgvector-java](https://github.com/pgvector/pgvector-java) +JavaScript, TypeScript | [pgvector-node](https://github.com/pgvector/pgvector-node) Julia | [pgvector-julia](https://github.com/pgvector/pgvector-julia) +Lisp | [pgvector-lisp](https://github.com/pgvector/pgvector-lisp) Lua | [pgvector-lua](https://github.com/pgvector/pgvector-lua) -Node.js | [pgvector-node](https://github.com/pgvector/pgvector-node) +Nim | [pgvector-nim](https://github.com/pgvector/pgvector-nim) +OCaml | [pgvector-ocaml](https://github.com/pgvector/pgvector-ocaml) Perl | [pgvector-perl](https://github.com/pgvector/pgvector-perl) PHP | [pgvector-php](https://github.com/pgvector/pgvector-php) Python | [pgvector-python](https://github.com/pgvector/pgvector-python) @@ -312,6 +819,7 @@ R | 
[pgvector-r](https://github.com/pgvector/pgvector-r) Ruby | [pgvector-ruby](https://github.com/pgvector/pgvector-ruby), [Neighbor](https://github.com/ankane/neighbor) Rust | [pgvector-rust](https://github.com/pgvector/pgvector-rust) Swift | [pgvector-swift](https://github.com/pgvector/pgvector-swift) +Zig | [pgvector-zig](https://github.com/pgvector/pgvector-zig) ## Frequently Asked Questions @@ -325,52 +833,259 @@ Yes, pgvector uses the write-ahead log (WAL), which allows for replication and p #### What if I want to index vectors with more than 2,000 dimensions? -You’ll need to use [dimensionality reduction](https://en.wikipedia.org/wiki/Dimensionality_reduction) at the moment. +You can use [half-precision indexing](#half-precision-indexing) to index up to 4,000 dimensions or [binary quantization](#binary-quantization) to index up to 64,000 dimensions. Another option is [dimensionality reduction](https://en.wikipedia.org/wiki/Dimensionality_reduction). + +#### Can I store vectors with different dimensions in the same column? + +You can use `vector` as the type (instead of `vector(3)`). + +```sql +CREATE TABLE embeddings (model_id bigint, item_id bigint, embedding vector, PRIMARY KEY (model_id, item_id)); +``` + +However, you can only create indexes on rows with the same number of dimensions (using [expression](https://www.postgresql.org/docs/current/indexes-expressional.html) and [partial](https://www.postgresql.org/docs/current/indexes-partial.html) indexing): + +```sql +CREATE INDEX ON embeddings USING hnsw ((embedding::vector(3)) vector_l2_ops) WHERE (model_id = 123); +``` + +and query with: + +```sql +SELECT * FROM embeddings WHERE model_id = 123 ORDER BY embedding::vector(3) <-> '[3,1,2]' LIMIT 5; +``` + +#### Can I store vectors with more precision? + +You can use the `double precision[]` or `numeric[]` type to store vectors with more precision. 
+ +```sql +CREATE TABLE items (id bigserial PRIMARY KEY, embedding double precision[]); + +-- use {} instead of [] for Postgres arrays +INSERT INTO items (embedding) VALUES ('{1,2,3}'), ('{4,5,6}'); +``` + +Optionally, add a [check constraint](https://www.postgresql.org/docs/current/ddl-constraints.html) to ensure data can be converted to the `vector` type and has the expected dimensions. + +```sql +ALTER TABLE items ADD CHECK (vector_dims(embedding::vector) = 3); +``` + +Use [expression indexing](https://www.postgresql.org/docs/current/indexes-expressional.html) to index (at a lower precision): + +```sql +CREATE INDEX ON items USING hnsw ((embedding::vector(3)) vector_l2_ops); +``` + +and query with: + +```sql +SELECT * FROM items ORDER BY embedding::vector(3) <-> '[3,1,2]' LIMIT 5; +``` + +#### Do indexes need to fit into memory? + +No, but like other index types, you’ll likely see better performance if they do. You can get the size of an index with: + +```sql +SELECT pg_size_pretty(pg_relation_size('index_name')); +``` + +## Troubleshooting + +#### Why isn’t a query using an index? + +The query needs to have an `ORDER BY` and `LIMIT`, and the `ORDER BY` must be the result of a distance operator (not an expression) in ascending order. + +```sql +-- index +ORDER BY embedding <=> '[3,1,2]' LIMIT 5; + +-- no index +ORDER BY 1 - (embedding <=> '[3,1,2]') DESC LIMIT 5; +``` + +You can encourage the planner to use an index for a query with: + +```sql +BEGIN; +SET LOCAL enable_seqscan = off; +SELECT ... +COMMIT; +``` + +Also, if the table is small, a table scan may be faster. + +#### Why isn’t a query using a parallel table scan? + +The planner doesn’t consider [out-of-line storage](https://www.postgresql.org/docs/current/storage-toast.html) in cost estimates, which can make a serial scan look cheaper. 
You can reduce the cost of a parallel scan for a query with: + +```sql +BEGIN; +SET LOCAL min_parallel_table_scan_size = 1; +SET LOCAL parallel_setup_cost = 1; +SELECT ... +COMMIT; +``` + +or choose to store vectors inline: + +```sql +ALTER TABLE items ALTER COLUMN embedding SET STORAGE PLAIN; +``` + +#### Why are there less results for a query after adding an HNSW index? + +Results are limited by the size of the dynamic candidate list (`hnsw.ef_search`). There may be even less results due to dead tuples or filtering conditions in the query. We recommend setting `hnsw.ef_search` to at least twice the `LIMIT` of the query. If you need more than 500 results, use an IVFFlat index instead. -#### Why am I seeing less results after adding an index? +Also, note that `NULL` vectors are not indexed (as well as zero vectors for cosine distance). + +#### Why are there less results for a query after adding an IVFFlat index? The index was likely created with too little data for the number of lists. Drop the index until the table has more data. +```sql +DROP INDEX index_name; +``` + +Results can also be limited by the number of probes (`ivfflat.probes`). + +Also, note that `NULL` vectors are not indexed (as well as zero vectors for cosine distance). + ## Reference +- [Vector](#vector-type) +- [Halfvec](#halfvec-type) +- [Bit](#bit-type) +- [Sparsevec](#sparsevec-type) + ### Vector Type -Each vector takes `4 * dimensions + 8` bytes of storage. Each element is a single precision floating-point number (like the `real` type in Postgres), and all elements must be finite (no `NaN`, `Infinity` or `-Infinity`). Vectors can have up to 16,000 dimensions. +Each vector takes `4 * dimensions + 8` bytes of storage. Each element is a single-precision floating-point number (like the `real` type in Postgres), and all elements must be finite (no `NaN`, `Infinity` or `-Infinity`). Vectors can have up to 16,000 dimensions. 
### Vector Operators -Operator | Description ---- | --- -\+ | element-wise addition -\- | element-wise subtraction -<-> | Euclidean distance -<#> | negative inner product -<=> | cosine distance +Operator | Description | Added +--- | --- | --- +\+ | element-wise addition | +\- | element-wise subtraction | +\* | element-wise multiplication | 0.5.0 +\|\| | concatenate | 0.7.0 +<-> | Euclidean distance | +<#> | negative inner product | +<=> | cosine distance | +<+> | taxicab distance | 0.7.0 ### Vector Functions -Function | Description ---- | --- -cosine_distance(vector, vector) → double precision | cosine distance -inner_product(vector, vector) → double precision | inner product -l2_distance(vector, vector) → double precision | Euclidean distance -vector_dims(vector) → integer | number of dimensions -vector_norm(vector) → double precision | Euclidean norm +Function | Description | Added +--- | --- | --- +binary_quantize(vector) → bit | binary quantize | 0.7.0 +cosine_distance(vector, vector) → double precision | cosine distance | +inner_product(vector, vector) → double precision | inner product | +l1_distance(vector, vector) → double precision | taxicab distance | 0.5.0 +l2_distance(vector, vector) → double precision | Euclidean distance | +l2_normalize(vector) → vector | Normalize with Euclidean norm | 0.7.0 +subvector(vector, integer, integer) → vector | subvector | 0.7.0 +vector_dims(vector) → integer | number of dimensions | +vector_norm(vector) → double precision | Euclidean norm | -### Aggregate Functions +### Vector Aggregate Functions -Function | Description ---- | --- -avg(vector) → vector | arithmetic mean +Function | Description | Added +--- | --- | --- +avg(vector) → vector | average | +sum(vector) → vector | sum | 0.5.0 + +### Halfvec Type + +Each half vector takes `2 * dimensions + 8` bytes of storage. Each element is a half-precision floating-point number, and all elements must be finite (no `NaN`, `Infinity` or `-Infinity`). 
Half vectors can have up to 16,000 dimensions. + +### Halfvec Operators + +Operator | Description | Added +--- | --- | --- +\+ | element-wise addition | 0.7.0 +\- | element-wise subtraction | 0.7.0 +\* | element-wise multiplication | 0.7.0 +\|\| | concatenate | 0.7.0 +<-> | Euclidean distance | 0.7.0 +<#> | negative inner product | 0.7.0 +<=> | cosine distance | 0.7.0 +<+> | taxicab distance | 0.7.0 + +### Halfvec Functions + +Function | Description | Added +--- | --- | --- +binary_quantize(halfvec) → bit | binary quantize | 0.7.0 +cosine_distance(halfvec, halfvec) → double precision | cosine distance | 0.7.0 +inner_product(halfvec, halfvec) → double precision | inner product | 0.7.0 +l1_distance(halfvec, halfvec) → double precision | taxicab distance | 0.7.0 +l2_distance(halfvec, halfvec) → double precision | Euclidean distance | 0.7.0 +l2_norm(halfvec) → double precision | Euclidean norm | 0.7.0 +l2_normalize(halfvec) → halfvec | Normalize with Euclidean norm | 0.7.0 +subvector(halfvec, integer, integer) → halfvec | subvector | 0.7.0 +vector_dims(halfvec) → integer | number of dimensions | 0.7.0 + +### Halfvec Aggregate Functions + +Function | Description | Added +--- | --- | --- +avg(halfvec) → halfvec | average | 0.7.0 +sum(halfvec) → halfvec | sum | 0.7.0 + +### Bit Type + +Each bit vector takes `dimensions / 8 + 8` bytes of storage. See the [Postgres docs](https://www.postgresql.org/docs/current/datatype-bit.html) for more info. + +### Bit Operators + +Operator | Description | Added +--- | --- | --- +<~> | Hamming distance | 0.7.0 +<%> | Jaccard distance | 0.7.0 + +### Bit Functions + +Function | Description | Added +--- | --- | --- +hamming_distance(bit, bit) → double precision | Hamming distance | 0.7.0 +jaccard_distance(bit, bit) → double precision | Jaccard distance | 0.7.0 + +### Sparsevec Type + +Each sparse vector takes `8 * non-zero elements + 16` bytes of storage. 
Each element is a single-precision floating-point number, and all elements must be finite (no `NaN`, `Infinity` or `-Infinity`). Sparse vectors can have up to 16,000 non-zero elements. + +### Sparsevec Operators + +Operator | Description | Added +--- | --- | --- +<-> | Euclidean distance | 0.7.0 +<#> | negative inner product | 0.7.0 +<=> | cosine distance | 0.7.0 +<+> | taxicab distance | 0.7.0 -## Installation Notes +### Sparsevec Functions + +Function | Description | Added +--- | --- | --- +cosine_distance(sparsevec, sparsevec) → double precision | cosine distance | 0.7.0 +inner_product(sparsevec, sparsevec) → double precision | inner product | 0.7.0 +l1_distance(sparsevec, sparsevec) → double precision | taxicab distance | 0.7.0 +l2_distance(sparsevec, sparsevec) → double precision | Euclidean distance | 0.7.0 +l2_norm(sparsevec) → double precision | Euclidean norm | 0.7.0 +l2_normalize(sparsevec) → sparsevec | Normalize with Euclidean norm | 0.7.0 + +## Installation Notes - Linux and Mac ### Postgres Location If your machine has multiple Postgres installations, specify the path to [pg_config](https://www.postgresql.org/docs/current/app-pgconfig.html) with: ```sh -export PG_CONFIG=/Applications/Postgres.app/Contents/Versions/latest/bin/pg_config +export PG_CONFIG=/Library/PostgreSQL/17/bin/pg_config ``` Then re-run the installation instructions (run `make clean` before `make` if needed). 
If `sudo` is needed for `make install`, use: @@ -379,6 +1094,14 @@ Then re-run the installation instructions (run `make clean` before `make` if nee sudo --preserve-env=PG_CONFIG make install ``` +A few common paths on Mac are: + +- EDB installer - `/Library/PostgreSQL/17/bin/pg_config` +- Homebrew (arm64) - `/opt/homebrew/opt/postgresql@17/bin/pg_config` +- Homebrew (x86-64) - `/usr/local/opt/postgresql@17/bin/pg_config` + +Note: Replace `17` with your Postgres server version + ### Missing Header If compilation fails with `fatal error: postgres.h: No such file or directory`, make sure Postgres development files are installed on the server. @@ -386,41 +1109,53 @@ If compilation fails with `fatal error: postgres.h: No such file or directory`, For Ubuntu and Debian, use: ```sh -sudo apt install postgresql-server-dev-15 +sudo apt install postgresql-server-dev-17 ``` -Note: Replace `15` with your Postgres server version +Note: Replace `17` with your Postgres server version -### Windows +### Missing SDK -Support for Windows is currently experimental. Use `nmake` to build: +If compilation fails and the output includes `warning: no such sysroot directory` on Mac, reinstall Xcode Command Line Tools. -```cmd -set "PGROOT=C:\Program Files\PostgreSQL\15" -git clone --branch v0.4.4 https://github.com/pgvector/pgvector.git -cd pgvector -nmake /F Makefile.win -nmake /F Makefile.win install +### Portability + +By default, pgvector compiles with `-march=native` on some platforms for best performance. However, this can lead to `Illegal instruction` errors if trying to run the compiled extension on a different machine. + +To compile for portability, use: + +```sh +make OPTFLAGS="" ``` +## Installation Notes - Windows + +### Missing Header + +If compilation fails with `Cannot open include file: 'postgres.h': No such file or directory`, make sure `PGROOT` is correct. 
+ +### Permissions + +If installation fails with `Access is denied`, re-run the installation instructions as an administrator. + ## Additional Installation Methods ### Docker -Get the [Docker image](https://hub.docker.com/r/ankane/pgvector) with: +Get the [Docker image](https://hub.docker.com/r/pgvector/pgvector) with: ```sh -docker pull ankane/pgvector +docker pull pgvector/pgvector:pg17 ``` -This adds pgvector to the [Postgres image](https://hub.docker.com/_/postgres) (run it the same way). +This adds pgvector to the [Postgres image](https://hub.docker.com/_/postgres) (replace `17` with your Postgres server version, and run it the same way). You can also build the image manually: ```sh -git clone --branch v0.4.4 https://github.com/pgvector/pgvector.git +git clone --branch v0.8.0 https://github.com/pgvector/pgvector.git cd pgvector -docker build --build-arg PG_MAJOR=15 -t myuser/pgvector . +docker build --pull --build-arg PG_MAJOR=17 -t myuser/pgvector . ``` ### Homebrew @@ -431,7 +1166,7 @@ With Homebrew Postgres, you can use: brew install pgvector ``` -Note: This only adds it to the `postgresql@14` formula +Note: This only adds it to the `postgresql@17` and `postgresql@14` formulas ### PGXN @@ -446,22 +1181,37 @@ pgxn install vector Debian and Ubuntu packages are available from the [PostgreSQL APT Repository](https://wiki.postgresql.org/wiki/Apt). Follow the [setup instructions](https://wiki.postgresql.org/wiki/Apt#Quickstart) and run: ```sh -sudo apt install postgresql-15-pgvector +sudo apt install postgresql-17-pgvector ``` -Note: Replace `15` with your Postgres server version +Note: Replace `17` with your Postgres server version ### Yum RPM packages are available from the [PostgreSQL Yum Repository](https://yum.postgresql.org/). 
Follow the [setup instructions](https://www.postgresql.org/download/linux/redhat/) for your distribution and run: ```sh -sudo yum install pgvector_15 +sudo yum install pgvector_17 # or -sudo dnf install pgvector_15 +sudo dnf install pgvector_17 +``` + +Note: Replace `17` with your Postgres server version + +### pkg + +Install the FreeBSD package with: + +```sh +pkg install postgresql15-pgvector ``` -Note: Replace `15` with your Postgres server version +or the port with: + +```sh +cd /usr/ports/databases/pgvector +make install +``` ### conda-forge @@ -481,44 +1231,48 @@ Download the [latest release](https://postgresapp.com/downloads.html) with Postg pgvector is available on [these providers](https://github.com/pgvector/pgvector/issues/54). -To request a new extension on other providers: - -- Google Cloud SQL - vote or comment on [this page](https://issuetracker.google.com/issues/265172065) -- DigitalOcean Managed Databases - vote or comment on [this page](https://ideas.digitalocean.com/managed-database/p/pgvector-extension-for-postgresql) -- Heroku Postgres - vote or comment on [this page](https://github.com/heroku/roadmap/issues/156) - ## Upgrading -Install the latest version and run: +[Install](#installation) the latest version (use the same method as the original installation). Then in each database you want to upgrade, run: ```sql ALTER EXTENSION vector UPDATE; ``` +You can check the version in the current database with: + +```sql +SELECT extversion FROM pg_extension WHERE extname = 'vector'; +``` + ## Upgrade Notes -### 0.4.0 +### 0.6.0 + +#### Postgres 12 -If upgrading with Postgres < 13, remove this line from `sql/vector--0.3.2--0.4.0.sql`: +If upgrading with Postgres 12, remove this line from `sql/vector--0.5.1--0.6.0.sql`: ```sql -ALTER TYPE vector SET (STORAGE = extended); +ALTER TYPE vector SET (STORAGE = external); ``` Then run `make install` and `ALTER EXTENSION vector UPDATE;`. 
-### 0.3.1 +#### Docker -If upgrading from 0.2.7 or 0.3.0, recreate all `ivfflat` indexes after upgrading to ensure all data is indexed. +The Docker image is now published in the `pgvector` org, and there are tags for each supported version of Postgres (rather than a `latest` tag). -```sql --- Postgres 12+ -REINDEX INDEX CONCURRENTLY index_name; +```sh +docker pull pgvector/pgvector:pg16 +# or +docker pull pgvector/pgvector:0.6.0-pg16 +``` --- Postgres < 12 -CREATE INDEX CONCURRENTLY temp_name ON table USING ivfflat (column opclass); -DROP INDEX CONCURRENTLY index_name; -ALTER INDEX temp_name RENAME TO index_name; +Also, if you’ve increased `maintenance_work_mem`, make sure `--shm-size` is at least that size to avoid an error with parallel HNSW index builds. + +```sh +docker run --shm-size=1g ... ``` ## Thanks @@ -527,9 +1281,10 @@ Thanks to: - [PASE: PostgreSQL Ultra-High-Dimensional Approximate Nearest Neighbor Search Extension](https://dl.acm.org/doi/pdf/10.1145/3318464.3386131) - [Faiss: A Library for Efficient Similarity Search and Clustering of Dense Vectors](https://github.com/facebookresearch/faiss) -- [Using the Triangle Inequality to Accelerate k-means](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf) +- [Using the Triangle Inequality to Accelerate k-means](https://cdn.aaai.org/ICML/2003/ICML03-022.pdf) - [k-means++: The Advantage of Careful Seeding](https://theory.stanford.edu/~sergei/papers/kMeansPP-soda.pdf) - [Concept Decompositions for Large Sparse Text Data using Clustering](https://www.cs.utexas.edu/users/inderjit/public_papers/concept_mlj.pdf) +- [Efficient and Robust Approximate Nearest Neighbor Search using Hierarchical Navigable Small World Graphs](https://arxiv.org/ftp/arxiv/papers/1603/1603.09320.pdf) ## History @@ -563,18 +1318,36 @@ make prove_installcheck # TAP tests To run single tests: ```sh -make installcheck REGRESS=functions # regression test -make prove_installcheck PROVE_TESTS=test/t/001_wal.pl # TAP test +make installcheck 
REGRESS=functions # regression test +make prove_installcheck PROVE_TESTS=test/t/001_ivfflat_wal.pl # TAP test +``` + +To enable assertions: + +```sh +make clean && PG_CFLAGS="-DUSE_ASSERT_CHECKING" make && make install ``` To enable benchmarking: ```sh -make clean && PG_CFLAGS=-DIVFFLAT_BENCH make && make install +make clean && PG_CFLAGS="-DIVFFLAT_BENCH" make && make install +``` + +To show memory usage: + +```sh +make clean && PG_CFLAGS="-DHNSW_MEMORY -DIVFFLAT_MEMORY" make && make install +``` + +To get k-means metrics: + +```sh +make clean && PG_CFLAGS="-DIVFFLAT_KMEANS_DEBUG" make && make install ``` Resources for contributors - [Extension Building Infrastructure](https://www.postgresql.org/docs/current/extend-pgxs.html) - [Index Access Method Interface Definition](https://www.postgresql.org/docs/current/indexam.html) -- [Generic WAL Records](https://www.postgresql.org/docs/13/generic-wal.html) +- [Generic WAL Records](https://www.postgresql.org/docs/current/generic-wal.html) diff --git a/src/postgres/third-party-extensions/pgvector/sql/vector--0.4.4-yb-1.0--0.4.4-yb-1.1.sql b/src/postgres/third-party-extensions/pgvector/sql/vector--0.4.4-yb-1.0--0.4.4-yb-1.1.sql deleted file mode 100644 index 34b01bef5cde..000000000000 --- a/src/postgres/third-party-extensions/pgvector/sql/vector--0.4.4-yb-1.0--0.4.4-yb-1.1.sql +++ /dev/null @@ -1,20 +0,0 @@ --- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "ALTER EXTENSION vector UPDATE TO '0.4.4-yb-1.1'" to load this file. 
\quit - -CREATE OPERATOR CLASS vector_ops - DEFAULT FOR TYPE vector USING btree AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 vector_cmp(vector, vector); - -CREATE OPERATOR CLASS vector_ops - DEFAULT FOR TYPE vector USING lsm AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 vector_cmp(vector, vector); diff --git a/src/postgres/third-party-extensions/pgvector/sql/vector--0.4.4-yb-1.0.sql b/src/postgres/third-party-extensions/pgvector/sql/vector--0.4.4-yb-1.0.sql deleted file mode 100644 index d291850a3b4e..000000000000 --- a/src/postgres/third-party-extensions/pgvector/sql/vector--0.4.4-yb-1.0.sql +++ /dev/null @@ -1,210 +0,0 @@ --- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "CREATE EXTENSION vector WITH VERSION '0.4.4-yb-1.0'" to load this file. \quit - --- type - -CREATE TYPE vector; - -CREATE FUNCTION vector_in(cstring, oid, integer) RETURNS vector - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_out(vector) RETURNS cstring - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_typmod_in(cstring[]) RETURNS integer - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_recv(internal, oid, integer) RETURNS vector - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_send(vector) RETURNS bytea - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE TYPE vector ( - INPUT = vector_in, - OUTPUT = vector_out, - TYPMOD_IN = vector_typmod_in, - RECEIVE = vector_recv, - SEND = vector_send, - STORAGE = extended -); - --- functions - -CREATE FUNCTION l2_distance(vector, vector) RETURNS float8 - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION inner_product(vector, vector) RETURNS float8 - AS 'MODULE_PATHNAME' LANGUAGE C 
IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION cosine_distance(vector, vector) RETURNS float8 - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_dims(vector) RETURNS integer - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_norm(vector) RETURNS float8 - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_add(vector, vector) RETURNS vector - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_sub(vector, vector) RETURNS vector - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - --- private functions - -CREATE FUNCTION vector_lt(vector, vector) RETURNS bool - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_le(vector, vector) RETURNS bool - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_eq(vector, vector) RETURNS bool - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_ne(vector, vector) RETURNS bool - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_ge(vector, vector) RETURNS bool - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_gt(vector, vector) RETURNS bool - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_cmp(vector, vector) RETURNS int4 - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_l2_squared_distance(vector, vector) RETURNS float8 - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_negative_inner_product(vector, vector) RETURNS float8 - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_spherical_distance(vector, vector) RETURNS float8 - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE 
FUNCTION vector_accum(double precision[], vector) RETURNS double precision[] - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_avg(double precision[]) RETURNS vector - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_combine(double precision[], double precision[]) RETURNS double precision[] - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - --- aggregates - -CREATE AGGREGATE avg(vector) ( - SFUNC = vector_accum, - STYPE = double precision[], - FINALFUNC = vector_avg, - COMBINEFUNC = vector_combine, - INITCOND = '{0}', - PARALLEL = SAFE -); - --- cast functions - -CREATE FUNCTION vector(vector, integer, boolean) RETURNS vector - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION array_to_vector(integer[], integer, boolean) RETURNS vector - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION array_to_vector(real[], integer, boolean) RETURNS vector - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION array_to_vector(double precision[], integer, boolean) RETURNS vector - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION array_to_vector(numeric[], integer, boolean) RETURNS vector - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - -CREATE FUNCTION vector_to_float4(vector, integer, boolean) RETURNS real[] - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - --- casts - -CREATE CAST (vector AS vector) - WITH FUNCTION vector(vector, integer, boolean) AS IMPLICIT; - -CREATE CAST (vector AS real[]) - WITH FUNCTION vector_to_float4(vector, integer, boolean) AS IMPLICIT; - -CREATE CAST (integer[] AS vector) - WITH FUNCTION array_to_vector(integer[], integer, boolean) AS ASSIGNMENT; - -CREATE CAST (real[] AS vector) - WITH FUNCTION array_to_vector(real[], integer, boolean) AS ASSIGNMENT; - -CREATE CAST (double precision[] 
AS vector) - WITH FUNCTION array_to_vector(double precision[], integer, boolean) AS ASSIGNMENT; - -CREATE CAST (numeric[] AS vector) - WITH FUNCTION array_to_vector(numeric[], integer, boolean) AS ASSIGNMENT; - --- operators - -CREATE OPERATOR <-> ( - LEFTARG = vector, RIGHTARG = vector, PROCEDURE = l2_distance, - COMMUTATOR = '<->' -); - -CREATE OPERATOR <#> ( - LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_negative_inner_product, - COMMUTATOR = '<#>' -); - -CREATE OPERATOR <=> ( - LEFTARG = vector, RIGHTARG = vector, PROCEDURE = cosine_distance, - COMMUTATOR = '<=>' -); - -CREATE OPERATOR + ( - LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_add, - COMMUTATOR = + -); - -CREATE OPERATOR - ( - LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_sub, - COMMUTATOR = - -); - -CREATE OPERATOR < ( - LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_lt, - COMMUTATOR = > , NEGATOR = >= , - RESTRICT = scalarltsel, JOIN = scalarltjoinsel -); - --- should use scalarlesel and scalarlejoinsel, but not supported in Postgres < 11 -CREATE OPERATOR <= ( - LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_le, - COMMUTATOR = >= , NEGATOR = > , - RESTRICT = scalarltsel, JOIN = scalarltjoinsel -); - -CREATE OPERATOR = ( - LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_eq, - COMMUTATOR = = , NEGATOR = <> , - RESTRICT = eqsel, JOIN = eqjoinsel -); - -CREATE OPERATOR <> ( - LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_ne, - COMMUTATOR = <> , NEGATOR = = , - RESTRICT = eqsel, JOIN = eqjoinsel -); - --- should use scalargesel and scalargejoinsel, but not supported in Postgres < 11 -CREATE OPERATOR >= ( - LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_ge, - COMMUTATOR = <= , NEGATOR = < , - RESTRICT = scalargtsel, JOIN = scalargtjoinsel -); - -CREATE OPERATOR > ( - LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_gt, - COMMUTATOR = < , NEGATOR = <= , - RESTRICT = scalargtsel, JOIN = scalargtjoinsel -); diff --git 
a/src/postgres/third-party-extensions/pgvector/sql/vector--0.4.4-yb-1.1--0.4.4-yb-1.2.sql b/src/postgres/third-party-extensions/pgvector/sql/vector--0.4.4-yb-1.1--0.4.4-yb-1.2.sql deleted file mode 100644 index 76c73f1e5738..000000000000 --- a/src/postgres/third-party-extensions/pgvector/sql/vector--0.4.4-yb-1.1--0.4.4-yb-1.2.sql +++ /dev/null @@ -1,13 +0,0 @@ -\echo Use "ALTER EXTENSION vector UPDATE TO '0.4.4-yb-1.2'" to load this file. \quit - -CREATE FUNCTION ybdummyannhandler(internal) RETURNS index_am_handler - AS 'MODULE_PATHNAME' LANGUAGE C; - -CREATE ACCESS METHOD ybdummyann TYPE INDEX HANDLER ybdummyannhandler; - -COMMENT ON ACCESS METHOD ybdummyann IS 'ybdummyann index access method'; - -CREATE OPERATOR CLASS vector_l2_ops - DEFAULT FOR TYPE vector USING ybdummyann AS - OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops, - FUNCTION 1 vector_l2_squared_distance(vector, vector); diff --git a/src/postgres/third-party-extensions/pgvector/sql/vector.sql b/src/postgres/third-party-extensions/pgvector/sql/vector.sql index 610118a0aee4..c15b6cb77217 100644 --- a/src/postgres/third-party-extensions/pgvector/sql/vector.sql +++ b/src/postgres/third-party-extensions/pgvector/sql/vector.sql @@ -1,7 +1,7 @@ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION vector" to load this file. 
\quit --- type +-- vector type SET yb_binary_restore TO true; SELECT binary_upgrade_set_next_pg_type_oid(8078); CREATE TYPE vector; @@ -31,7 +31,7 @@ CREATE TYPE vector ( STORAGE = extended ); --- functions +-- vector functions CREATE FUNCTION l2_distance(vector, vector) RETURNS float8 AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; @@ -42,19 +42,37 @@ CREATE FUNCTION inner_product(vector, vector) RETURNS float8 CREATE FUNCTION cosine_distance(vector, vector) RETURNS float8 AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; +CREATE FUNCTION l1_distance(vector, vector) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + CREATE FUNCTION vector_dims(vector) RETURNS integer AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; CREATE FUNCTION vector_norm(vector) RETURNS float8 AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; +CREATE FUNCTION l2_normalize(vector) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION binary_quantize(vector) RETURNS bit + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION subvector(vector, int, int) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- vector private functions + CREATE FUNCTION vector_add(vector, vector) RETURNS vector AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; CREATE FUNCTION vector_sub(vector, vector) RETURNS vector AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; --- private functions +CREATE FUNCTION vector_mul(vector, vector) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_concat(vector, vector) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; CREATE FUNCTION vector_lt(vector, vector) RETURNS bool AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; @@ -95,7 +113,7 @@ CREATE FUNCTION 
vector_avg(double precision[]) RETURNS vector CREATE FUNCTION vector_combine(double precision[], double precision[]) RETURNS double precision[] AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; --- aggregates +-- vector aggregates CREATE AGGREGATE avg(vector) ( SFUNC = vector_accum, @@ -106,7 +124,14 @@ CREATE AGGREGATE avg(vector) ( PARALLEL = SAFE ); --- cast functions +CREATE AGGREGATE sum(vector) ( + SFUNC = vector_add, + STYPE = vector, + COMBINEFUNC = vector_add, + PARALLEL = SAFE +); + +-- vector cast functions CREATE FUNCTION vector(vector, integer, boolean) RETURNS vector AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; @@ -126,7 +151,7 @@ CREATE FUNCTION array_to_vector(numeric[], integer, boolean) RETURNS vector CREATE FUNCTION vector_to_float4(vector, integer, boolean) RETURNS real[] AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; --- casts +-- vector casts CREATE CAST (vector AS vector) WITH FUNCTION vector(vector, integer, boolean) AS IMPLICIT; @@ -146,7 +171,7 @@ CREATE CAST (double precision[] AS vector) CREATE CAST (numeric[] AS vector) WITH FUNCTION array_to_vector(numeric[], integer, boolean) AS ASSIGNMENT; --- operators +-- vector operators CREATE OPERATOR <-> ( LEFTARG = vector, RIGHTARG = vector, PROCEDURE = l2_distance, @@ -163,14 +188,27 @@ CREATE OPERATOR <=> ( COMMUTATOR = '<=>' ); +CREATE OPERATOR <+> ( + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = l1_distance, + COMMUTATOR = '<+>' +); + CREATE OPERATOR + ( LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_add, COMMUTATOR = + ); CREATE OPERATOR - ( - LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_sub, - COMMUTATOR = - + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_sub +); + +CREATE OPERATOR * ( + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_mul, + COMMUTATOR = * +); + +CREATE OPERATOR || ( + LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_concat ); CREATE OPERATOR < ( @@ -179,11 +217,10 @@ 
CREATE OPERATOR < ( RESTRICT = scalarltsel, JOIN = scalarltjoinsel ); --- should use scalarlesel and scalarlejoinsel, but not supported in Postgres < 11 CREATE OPERATOR <= ( LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_le, COMMUTATOR = >= , NEGATOR = > , - RESTRICT = scalarltsel, JOIN = scalarltjoinsel + RESTRICT = scalarlesel, JOIN = scalarlejoinsel ); CREATE OPERATOR = ( @@ -198,11 +235,10 @@ CREATE OPERATOR <> ( RESTRICT = eqsel, JOIN = eqjoinsel ); --- should use scalargesel and scalargejoinsel, but not supported in Postgres < 11 CREATE OPERATOR >= ( LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_ge, COMMUTATOR = <= , NEGATOR = < , - RESTRICT = scalargtsel, JOIN = scalargtjoinsel + RESTRICT = scalargesel, JOIN = scalargejoinsel ); CREATE OPERATOR > ( @@ -211,7 +247,23 @@ CREATE OPERATOR > ( RESTRICT = scalargtsel, JOIN = scalargtjoinsel ); --- opclasses +-- access methods + +CREATE FUNCTION ybdummyannhandler(internal) RETURNS index_am_handler + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE ACCESS METHOD ybdummyann TYPE INDEX HANDLER ybdummyannhandler; + +COMMENT ON ACCESS METHOD ybdummyann IS 'ybdummyann index access method'; + +CREATE FUNCTION ybhnswhandler(internal) RETURNS index_am_handler + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE ACCESS METHOD ybhnsw TYPE INDEX HANDLER ybhnswhandler; + +COMMENT ON ACCESS METHOD ybhnsw IS 'ybhnsw index access method'; + +-- vector opclasses CREATE OPERATOR CLASS vector_ops DEFAULT FOR TYPE vector USING btree AS @@ -231,25 +283,11 @@ CREATE OPERATOR CLASS vector_ops OPERATOR 5 > , FUNCTION 1 vector_cmp(vector, vector); -CREATE FUNCTION ybdummyannhandler(internal) RETURNS index_am_handler - AS 'MODULE_PATHNAME' LANGUAGE C; - -CREATE ACCESS METHOD ybdummyann TYPE INDEX HANDLER ybdummyannhandler; - -COMMENT ON ACCESS METHOD ybdummyann IS 'ybdummyann index access method'; - CREATE OPERATOR CLASS vector_l2_ops DEFAULT FOR TYPE vector USING ybdummyann AS OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops, 
FUNCTION 1 vector_l2_squared_distance(vector, vector); -CREATE FUNCTION ybhnswhandler(internal) RETURNS index_am_handler - AS 'MODULE_PATHNAME' LANGUAGE C; - -CREATE ACCESS METHOD ybhnsw TYPE INDEX HANDLER ybhnswhandler; - -COMMENT ON ACCESS METHOD ybhnsw IS 'ybhnsw index access method'; - CREATE OPERATOR CLASS vector_l2_ops DEFAULT FOR TYPE vector USING ybhnsw AS OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops, @@ -265,3 +303,512 @@ CREATE OPERATOR CLASS vector_cosine_ops OPERATOR 1 <=> (vector, vector) FOR ORDER BY float_ops, FUNCTION 1 vector_negative_inner_product(vector, vector), FUNCTION 2 vector_norm(vector); + +-- halfvec type + +CREATE TYPE halfvec; + +CREATE FUNCTION halfvec_in(cstring, oid, integer) RETURNS halfvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_out(halfvec) RETURNS cstring + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_typmod_in(cstring[]) RETURNS integer + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_recv(internal, oid, integer) RETURNS halfvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_send(halfvec) RETURNS bytea + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE TYPE halfvec ( + INPUT = halfvec_in, + OUTPUT = halfvec_out, + TYPMOD_IN = halfvec_typmod_in, + RECEIVE = halfvec_recv, + SEND = halfvec_send, + STORAGE = extended +); + +-- halfvec functions + +CREATE FUNCTION l2_distance(halfvec, halfvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'halfvec_l2_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION inner_product(halfvec, halfvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'halfvec_inner_product' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION cosine_distance(halfvec, halfvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'halfvec_cosine_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + 
+CREATE FUNCTION l1_distance(halfvec, halfvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'halfvec_l1_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_dims(halfvec) RETURNS integer + AS 'MODULE_PATHNAME', 'halfvec_vector_dims' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION l2_norm(halfvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'halfvec_l2_norm' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION l2_normalize(halfvec) RETURNS halfvec + AS 'MODULE_PATHNAME', 'halfvec_l2_normalize' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION binary_quantize(halfvec) RETURNS bit + AS 'MODULE_PATHNAME', 'halfvec_binary_quantize' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION subvector(halfvec, int, int) RETURNS halfvec + AS 'MODULE_PATHNAME', 'halfvec_subvector' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- halfvec private functions + +CREATE FUNCTION halfvec_add(halfvec, halfvec) RETURNS halfvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_sub(halfvec, halfvec) RETURNS halfvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_mul(halfvec, halfvec) RETURNS halfvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_concat(halfvec, halfvec) RETURNS halfvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_lt(halfvec, halfvec) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_le(halfvec, halfvec) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_eq(halfvec, halfvec) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_ne(halfvec, halfvec) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_ge(halfvec, halfvec) 
RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_gt(halfvec, halfvec) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_cmp(halfvec, halfvec) RETURNS int4 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_l2_squared_distance(halfvec, halfvec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_negative_inner_product(halfvec, halfvec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_spherical_distance(halfvec, halfvec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_accum(double precision[], halfvec) RETURNS double precision[] + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_avg(double precision[]) RETURNS halfvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_combine(double precision[], double precision[]) RETURNS double precision[] + AS 'MODULE_PATHNAME', 'vector_combine' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- halfvec aggregates + +CREATE AGGREGATE avg(halfvec) ( + SFUNC = halfvec_accum, + STYPE = double precision[], + FINALFUNC = halfvec_avg, + COMBINEFUNC = halfvec_combine, + INITCOND = '{0}', + PARALLEL = SAFE +); + +CREATE AGGREGATE sum(halfvec) ( + SFUNC = halfvec_add, + STYPE = halfvec, + COMBINEFUNC = halfvec_add, + PARALLEL = SAFE +); + +-- halfvec cast functions + +CREATE FUNCTION halfvec(halfvec, integer, boolean) RETURNS halfvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_to_vector(halfvec, integer, boolean) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_to_halfvec(vector, integer, boolean) RETURNS halfvec + AS 
'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_halfvec(integer[], integer, boolean) RETURNS halfvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_halfvec(real[], integer, boolean) RETURNS halfvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_halfvec(double precision[], integer, boolean) RETURNS halfvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_halfvec(numeric[], integer, boolean) RETURNS halfvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_to_float4(halfvec, integer, boolean) RETURNS real[] + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- halfvec casts + +CREATE CAST (halfvec AS halfvec) + WITH FUNCTION halfvec(halfvec, integer, boolean) AS IMPLICIT; + +CREATE CAST (halfvec AS vector) + WITH FUNCTION halfvec_to_vector(halfvec, integer, boolean) AS ASSIGNMENT; + +CREATE CAST (vector AS halfvec) + WITH FUNCTION vector_to_halfvec(vector, integer, boolean) AS IMPLICIT; + +CREATE CAST (halfvec AS real[]) + WITH FUNCTION halfvec_to_float4(halfvec, integer, boolean) AS ASSIGNMENT; + +CREATE CAST (integer[] AS halfvec) + WITH FUNCTION array_to_halfvec(integer[], integer, boolean) AS ASSIGNMENT; + +CREATE CAST (real[] AS halfvec) + WITH FUNCTION array_to_halfvec(real[], integer, boolean) AS ASSIGNMENT; + +CREATE CAST (double precision[] AS halfvec) + WITH FUNCTION array_to_halfvec(double precision[], integer, boolean) AS ASSIGNMENT; + +CREATE CAST (numeric[] AS halfvec) + WITH FUNCTION array_to_halfvec(numeric[], integer, boolean) AS ASSIGNMENT; + +-- halfvec operators + +CREATE OPERATOR <-> ( + LEFTARG = halfvec, RIGHTARG = halfvec, PROCEDURE = l2_distance, + COMMUTATOR = '<->' +); + +CREATE OPERATOR <#> ( + LEFTARG = halfvec, RIGHTARG = halfvec, PROCEDURE = halfvec_negative_inner_product, + COMMUTATOR = '<#>' +); + 
+CREATE OPERATOR <=> ( + LEFTARG = halfvec, RIGHTARG = halfvec, PROCEDURE = cosine_distance, + COMMUTATOR = '<=>' +); + +CREATE OPERATOR <+> ( + LEFTARG = halfvec, RIGHTARG = halfvec, PROCEDURE = l1_distance, + COMMUTATOR = '<+>' +); + +CREATE OPERATOR + ( + LEFTARG = halfvec, RIGHTARG = halfvec, PROCEDURE = halfvec_add, + COMMUTATOR = + +); + +CREATE OPERATOR - ( + LEFTARG = halfvec, RIGHTARG = halfvec, PROCEDURE = halfvec_sub +); + +CREATE OPERATOR * ( + LEFTARG = halfvec, RIGHTARG = halfvec, PROCEDURE = halfvec_mul, + COMMUTATOR = * +); + +CREATE OPERATOR || ( + LEFTARG = halfvec, RIGHTARG = halfvec, PROCEDURE = halfvec_concat +); + +CREATE OPERATOR < ( + LEFTARG = halfvec, RIGHTARG = halfvec, PROCEDURE = halfvec_lt, + COMMUTATOR = > , NEGATOR = >= , + RESTRICT = scalarltsel, JOIN = scalarltjoinsel +); + +CREATE OPERATOR <= ( + LEFTARG = halfvec, RIGHTARG = halfvec, PROCEDURE = halfvec_le, + COMMUTATOR = >= , NEGATOR = > , + RESTRICT = scalarlesel, JOIN = scalarlejoinsel +); + +CREATE OPERATOR = ( + LEFTARG = halfvec, RIGHTARG = halfvec, PROCEDURE = halfvec_eq, + COMMUTATOR = = , NEGATOR = <> , + RESTRICT = eqsel, JOIN = eqjoinsel +); + +CREATE OPERATOR <> ( + LEFTARG = halfvec, RIGHTARG = halfvec, PROCEDURE = halfvec_ne, + COMMUTATOR = <> , NEGATOR = = , + RESTRICT = eqsel, JOIN = eqjoinsel +); + +CREATE OPERATOR >= ( + LEFTARG = halfvec, RIGHTARG = halfvec, PROCEDURE = halfvec_ge, + COMMUTATOR = <= , NEGATOR = < , + RESTRICT = scalargesel, JOIN = scalargejoinsel +); + +CREATE OPERATOR > ( + LEFTARG = halfvec, RIGHTARG = halfvec, PROCEDURE = halfvec_gt, + COMMUTATOR = < , NEGATOR = <= , + RESTRICT = scalargtsel, JOIN = scalargtjoinsel +); + +-- halfvec opclasses + +CREATE OPERATOR CLASS halfvec_ops + DEFAULT FOR TYPE halfvec USING btree AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 halfvec_cmp(halfvec, halfvec); + +CREATE OPERATOR CLASS halfvec_ops + DEFAULT FOR TYPE halfvec USING lsm AS + OPERATOR 1 < , 
+ OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 halfvec_cmp(halfvec, halfvec); + +-- bit functions + +CREATE FUNCTION hamming_distance(bit, bit) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION jaccard_distance(bit, bit) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- bit operators + +CREATE OPERATOR <~> ( + LEFTARG = bit, RIGHTARG = bit, PROCEDURE = hamming_distance, + COMMUTATOR = '<~>' +); + +CREATE OPERATOR <%> ( + LEFTARG = bit, RIGHTARG = bit, PROCEDURE = jaccard_distance, + COMMUTATOR = '<%>' +); + +--- sparsevec type + +CREATE TYPE sparsevec; + +CREATE FUNCTION sparsevec_in(cstring, oid, integer) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_out(sparsevec) RETURNS cstring + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_typmod_in(cstring[]) RETURNS integer + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_recv(internal, oid, integer) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_send(sparsevec) RETURNS bytea + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE TYPE sparsevec ( + INPUT = sparsevec_in, + OUTPUT = sparsevec_out, + TYPMOD_IN = sparsevec_typmod_in, + RECEIVE = sparsevec_recv, + SEND = sparsevec_send, + STORAGE = extended +); + +-- sparsevec functions + +CREATE FUNCTION l2_distance(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME', 'sparsevec_l2_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION inner_product(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME', 'sparsevec_inner_product' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION cosine_distance(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME', 
'sparsevec_cosine_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION l1_distance(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME', 'sparsevec_l1_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION l2_norm(sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME', 'sparsevec_l2_norm' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION l2_normalize(sparsevec) RETURNS sparsevec + AS 'MODULE_PATHNAME', 'sparsevec_l2_normalize' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- sparsevec private functions + +CREATE FUNCTION sparsevec_lt(sparsevec, sparsevec) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_le(sparsevec, sparsevec) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_eq(sparsevec, sparsevec) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_ne(sparsevec, sparsevec) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_ge(sparsevec, sparsevec) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_gt(sparsevec, sparsevec) RETURNS bool + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_cmp(sparsevec, sparsevec) RETURNS int4 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_l2_squared_distance(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_negative_inner_product(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- sparsevec cast functions + +CREATE FUNCTION sparsevec(sparsevec, integer, boolean) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION 
vector_to_sparsevec(vector, integer, boolean) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_to_vector(sparsevec, integer, boolean) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION halfvec_to_sparsevec(halfvec, integer, boolean) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_to_halfvec(sparsevec, integer, boolean) RETURNS halfvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_sparsevec(integer[], integer, boolean) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_sparsevec(real[], integer, boolean) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_sparsevec(double precision[], integer, boolean) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_sparsevec(numeric[], integer, boolean) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- sparsevec casts + +CREATE CAST (sparsevec AS sparsevec) + WITH FUNCTION sparsevec(sparsevec, integer, boolean) AS IMPLICIT; + +CREATE CAST (sparsevec AS vector) + WITH FUNCTION sparsevec_to_vector(sparsevec, integer, boolean) AS ASSIGNMENT; + +CREATE CAST (vector AS sparsevec) + WITH FUNCTION vector_to_sparsevec(vector, integer, boolean) AS IMPLICIT; + +CREATE CAST (sparsevec AS halfvec) + WITH FUNCTION sparsevec_to_halfvec(sparsevec, integer, boolean) AS ASSIGNMENT; + +CREATE CAST (halfvec AS sparsevec) + WITH FUNCTION halfvec_to_sparsevec(halfvec, integer, boolean) AS IMPLICIT; + +CREATE CAST (integer[] AS sparsevec) + WITH FUNCTION array_to_sparsevec(integer[], integer, boolean) AS ASSIGNMENT; + +CREATE CAST (real[] AS sparsevec) + WITH FUNCTION array_to_sparsevec(real[], integer, 
boolean) AS ASSIGNMENT; + +CREATE CAST (double precision[] AS sparsevec) + WITH FUNCTION array_to_sparsevec(double precision[], integer, boolean) AS ASSIGNMENT; + +CREATE CAST (numeric[] AS sparsevec) + WITH FUNCTION array_to_sparsevec(numeric[], integer, boolean) AS ASSIGNMENT; + +-- sparsevec operators + +CREATE OPERATOR <-> ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = l2_distance, + COMMUTATOR = '<->' +); + +CREATE OPERATOR <#> ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = sparsevec_negative_inner_product, + COMMUTATOR = '<#>' +); + +CREATE OPERATOR <=> ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = cosine_distance, + COMMUTATOR = '<=>' +); + +CREATE OPERATOR <+> ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = l1_distance, + COMMUTATOR = '<+>' +); + +CREATE OPERATOR < ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = sparsevec_lt, + COMMUTATOR = > , NEGATOR = >= , + RESTRICT = scalarltsel, JOIN = scalarltjoinsel +); + +CREATE OPERATOR <= ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = sparsevec_le, + COMMUTATOR = >= , NEGATOR = > , + RESTRICT = scalarlesel, JOIN = scalarlejoinsel +); + +CREATE OPERATOR = ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = sparsevec_eq, + COMMUTATOR = = , NEGATOR = <> , + RESTRICT = eqsel, JOIN = eqjoinsel +); + +CREATE OPERATOR <> ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = sparsevec_ne, + COMMUTATOR = <> , NEGATOR = = , + RESTRICT = eqsel, JOIN = eqjoinsel +); + +CREATE OPERATOR >= ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = sparsevec_ge, + COMMUTATOR = <= , NEGATOR = < , + RESTRICT = scalargesel, JOIN = scalargejoinsel +); + +CREATE OPERATOR > ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = sparsevec_gt, + COMMUTATOR = < , NEGATOR = <= , + RESTRICT = scalargtsel, JOIN = scalargtjoinsel +); + +-- sparsevec opclasses + +CREATE OPERATOR CLASS sparsevec_ops + DEFAULT FOR TYPE sparsevec USING btree AS + OPERATOR 1 < 
, + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 sparsevec_cmp(sparsevec, sparsevec); + +CREATE OPERATOR CLASS sparsevec_ops + DEFAULT FOR TYPE sparsevec USING lsm AS + OPERATOR 1 < , + OPERATOR 2 <= , + OPERATOR 3 = , + OPERATOR 4 >= , + OPERATOR 5 > , + FUNCTION 1 sparsevec_cmp(sparsevec, sparsevec); diff --git a/src/postgres/third-party-extensions/pgvector/src/bitutils.c b/src/postgres/third-party-extensions/pgvector/src/bitutils.c new file mode 100644 index 000000000000..578bcad1de85 --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/src/bitutils.c @@ -0,0 +1,222 @@ +#include "postgres.h" + +#include "bitutils.h" +#include "halfvec.h" /* for USE_DISPATCH and USE_TARGET_CLONES */ +#include "port/pg_bitutils.h" + +#if defined(USE_DISPATCH) +#define BIT_DISPATCH +#endif + +#ifdef BIT_DISPATCH +#include <immintrin.h> + +#if defined(USE__GET_CPUID) +#include <cpuid.h> +#else +#include <intrin.h> +#endif + +#ifdef _MSC_VER +#define TARGET_AVX512_POPCOUNT +#else +#define TARGET_AVX512_POPCOUNT __attribute__((target("avx512f,avx512vpopcntdq"))) +#endif +#endif + +/* Disable for LLVM due to crash with bitcode generation */ +#if defined(USE_TARGET_CLONES) && !defined(__POPCNT__) && !defined(__llvm__) +#define BIT_TARGET_CLONES __attribute__((target_clones("default", "popcnt"))) +#else +#define BIT_TARGET_CLONES +#endif + +/* Use built-ins when possible for inlining */ +#if defined(HAVE__BUILTIN_POPCOUNT) && defined(HAVE_LONG_INT_64) +#define popcount64(x) __builtin_popcountl(x) +#elif defined(HAVE__BUILTIN_POPCOUNT) && defined(HAVE_LONG_LONG_INT_64) +#define popcount64(x) __builtin_popcountll(x) +#elif !defined(_MSC_VER) +/* Fails to resolve with MSVC */ +#define popcount64(x) pg_popcount64(x) +#endif + +uint64 (*BitHammingDistance) (uint32 bytes, unsigned char *ax, unsigned char *bx, uint64 distance); +double (*BitJaccardDistance) (uint32 bytes, unsigned char *ax, unsigned char *bx, uint64 ab, uint64 aa, uint64 bb); + +BIT_TARGET_CLONES static uint64
+BitHammingDistanceDefault(uint32 bytes, unsigned char *ax, unsigned char *bx, uint64 distance) +{ +#ifdef popcount64 + for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) + { + uint64 axs; + uint64 bxs; + + /* Ensure aligned */ + memcpy(&axs, ax, sizeof(uint64)); + memcpy(&bxs, bx, sizeof(uint64)); + + distance += popcount64(axs ^ bxs); + + ax += sizeof(uint64); + bx += sizeof(uint64); + } +#endif + + for (uint32 i = 0; i < bytes; i++) + distance += pg_number_of_ones[ax[i] ^ bx[i]]; + + return distance; +} + +#ifdef BIT_DISPATCH +TARGET_AVX512_POPCOUNT static uint64 +BitHammingDistanceAvx512Popcount(uint32 bytes, unsigned char *ax, unsigned char *bx, uint64 distance) +{ + __m512i dist = _mm512_setzero_si512(); + + for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i)) + { + __m512i axs = _mm512_loadu_si512((const __m512i *) ax); + __m512i bxs = _mm512_loadu_si512((const __m512i *) bx); + + dist = _mm512_add_epi64(dist, _mm512_popcnt_epi64(_mm512_xor_si512(axs, bxs))); + + ax += sizeof(__m512i); + bx += sizeof(__m512i); + } + + distance += _mm512_reduce_add_epi64(dist); + + return BitHammingDistanceDefault(bytes, ax, bx, distance); +} +#endif + +BIT_TARGET_CLONES static double +BitJaccardDistanceDefault(uint32 bytes, unsigned char *ax, unsigned char *bx, uint64 ab, uint64 aa, uint64 bb) +{ +#ifdef popcount64 + for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) + { + uint64 axs; + uint64 bxs; + + /* Ensure aligned */ + memcpy(&axs, ax, sizeof(uint64)); + memcpy(&bxs, bx, sizeof(uint64)); + + ab += popcount64(axs & bxs); + aa += popcount64(axs); + bb += popcount64(bxs); + + ax += sizeof(uint64); + bx += sizeof(uint64); + } +#endif + + for (uint32 i = 0; i < bytes; i++) + { + ab += pg_number_of_ones[ax[i] & bx[i]]; + aa += pg_number_of_ones[ax[i]]; + bb += pg_number_of_ones[bx[i]]; + } + + if (ab == 0) + return 1; + else + return 1 - (ab / ((double) (aa + bb - ab))); +} + +#ifdef BIT_DISPATCH +TARGET_AVX512_POPCOUNT static double 
+BitJaccardDistanceAvx512Popcount(uint32 bytes, unsigned char *ax, unsigned char *bx, uint64 ab, uint64 aa, uint64 bb) +{ + __m512i abx = _mm512_setzero_si512(); + __m512i aax = _mm512_setzero_si512(); + __m512i bbx = _mm512_setzero_si512(); + + for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i)) + { + __m512i axs = _mm512_loadu_si512((const __m512i *) ax); + __m512i bxs = _mm512_loadu_si512((const __m512i *) bx); + + abx = _mm512_add_epi64(abx, _mm512_popcnt_epi64(_mm512_and_si512(axs, bxs))); + aax = _mm512_add_epi64(aax, _mm512_popcnt_epi64(axs)); + bbx = _mm512_add_epi64(bbx, _mm512_popcnt_epi64(bxs)); + + ax += sizeof(__m512i); + bx += sizeof(__m512i); + } + + ab += _mm512_reduce_add_epi64(abx); + aa += _mm512_reduce_add_epi64(aax); + bb += _mm512_reduce_add_epi64(bbx); + + return BitJaccardDistanceDefault(bytes, ax, bx, ab, aa, bb); +} +#endif + +#ifdef BIT_DISPATCH +#define CPU_FEATURE_OSXSAVE (1 << 27) /* F1 ECX */ +#define CPU_FEATURE_AVX512F (1 << 16) /* F7,0 EBX */ +#define CPU_FEATURE_AVX512VPOPCNTDQ (1 << 14) /* F7,0 ECX */ + +#ifdef _MSC_VER +#define TARGET_XSAVE +#else +#define TARGET_XSAVE __attribute__((target("xsave"))) +#endif + +TARGET_XSAVE static bool +SupportsAvx512Popcount() +{ + unsigned int exx[4] = {0, 0, 0, 0}; + +#if defined(USE__GET_CPUID) + __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); +#else + __cpuid(exx, 1); +#endif + + /* Check OS supports XSAVE */ + if ((exx[2] & CPU_FEATURE_OSXSAVE) != CPU_FEATURE_OSXSAVE) + return false; + + /* Check XMM, YMM, and ZMM registers are enabled */ + if ((_xgetbv(0) & 0xe6) != 0xe6) + return false; + +#if defined(USE__GET_CPUID) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#else + __cpuidex(exx, 7, 0); +#endif + + /* Check AVX512F */ + if ((exx[1] & CPU_FEATURE_AVX512F) != CPU_FEATURE_AVX512F) + return false; + + /* Check AVX512VPOPCNTDQ */ + return (exx[2] & CPU_FEATURE_AVX512VPOPCNTDQ) == CPU_FEATURE_AVX512VPOPCNTDQ; +} +#endif + +void +BitvecInit(void) +{ + /* + * 
Could skip pointer when single function, but no difference in + * performance + */ + BitHammingDistance = BitHammingDistanceDefault; + BitJaccardDistance = BitJaccardDistanceDefault; + +#ifdef BIT_DISPATCH + if (SupportsAvx512Popcount()) + { + BitHammingDistance = BitHammingDistanceAvx512Popcount; + BitJaccardDistance = BitJaccardDistanceAvx512Popcount; + } +#endif +} diff --git a/src/postgres/third-party-extensions/pgvector/src/bitutils.h b/src/postgres/third-party-extensions/pgvector/src/bitutils.h new file mode 100644 index 000000000000..b4d85bb1c148 --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/src/bitutils.h @@ -0,0 +1,16 @@ +#ifndef BITUTILS_H +#define BITUTILS_H + +#include "postgres.h" + +/* Check version in first header */ +#if PG_VERSION_NUM < 130000 +#error "Requires PostgreSQL 13+" +#endif + +extern uint64 (*BitHammingDistance) (uint32 bytes, unsigned char *ax, unsigned char *bx, uint64 distance); +extern double (*BitJaccardDistance) (uint32 bytes, unsigned char *ax, unsigned char *bx, uint64 ab, uint64 aa, uint64 bb); + +void BitvecInit(void); + +#endif diff --git a/src/postgres/third-party-extensions/pgvector/src/bitvec.c b/src/postgres/third-party-extensions/pgvector/src/bitvec.c new file mode 100644 index 000000000000..094ddd282c19 --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/src/bitvec.c @@ -0,0 +1,69 @@ +#include "postgres.h" + +#include "bitutils.h" +#include "bitvec.h" +#include "utils/varbit.h" +#include "vector.h" + +#if PG_VERSION_NUM >= 160000 +#include "varatt.h" +#endif + +/* + * Allocate and initialize a new bit vector + */ +VarBit * +InitBitVector(int dim) +{ + VarBit *result; + int size; + + size = VARBITTOTALLEN(dim); + result = (VarBit *) palloc0(size); + SET_VARSIZE(result, size); + VARBITLEN(result) = dim; + + return result; +} + +/* + * Ensure same dimensions + */ +static inline void +CheckDims(VarBit *a, VarBit *b) +{ + if (VARBITLEN(a) != VARBITLEN(b)) + ereport(ERROR, + 
(errcode(ERRCODE_DATA_EXCEPTION), + errmsg("different bit lengths %u and %u", VARBITLEN(a), VARBITLEN(b)))); +} + +/* + * Get the Hamming distance between two bit vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(hamming_distance); +Datum +hamming_distance(PG_FUNCTION_ARGS) +{ + VarBit *a = PG_GETARG_VARBIT_P(0); + VarBit *b = PG_GETARG_VARBIT_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8((double) BitHammingDistance(VARBITBYTES(a), VARBITS(a), VARBITS(b), 0)); +} + +/* + * Get the Jaccard distance between two bit vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(jaccard_distance); +Datum +jaccard_distance(PG_FUNCTION_ARGS) +{ + VarBit *a = PG_GETARG_VARBIT_P(0); + VarBit *b = PG_GETARG_VARBIT_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8(BitJaccardDistance(VARBITBYTES(a), VARBITS(a), VARBITS(b), 0, 0, 0)); +} diff --git a/src/postgres/third-party-extensions/pgvector/src/bitvec.h b/src/postgres/third-party-extensions/pgvector/src/bitvec.h new file mode 100644 index 000000000000..1ff9e55114d5 --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/src/bitvec.h @@ -0,0 +1,8 @@ +#ifndef BITVEC_H +#define BITVEC_H + +#include "utils/varbit.h" + +VarBit *InitBitVector(int dim); + +#endif diff --git a/src/postgres/third-party-extensions/pgvector/src/halfutils.c b/src/postgres/third-party-extensions/pgvector/src/halfutils.c new file mode 100644 index 000000000000..d16909409dfc --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/src/halfutils.c @@ -0,0 +1,298 @@ +#include "postgres.h" + +#include "halfutils.h" +#include "halfvec.h" + +#ifdef HALFVEC_DISPATCH +#include <immintrin.h> + +#if defined(USE__GET_CPUID) +#include <cpuid.h> +#else +#include <intrin.h> +#endif + +#ifdef _MSC_VER +#define TARGET_F16C +#else +#define TARGET_F16C __attribute__((target("avx,f16c,fma"))) +#endif +#endif + +float (*HalfvecL2SquaredDistance) (int dim, half * ax, half * bx); +float (*HalfvecInnerProduct) (int dim, half * ax, half * bx); +double (*HalfvecCosineSimilarity) (int dim, half * ax, half *
bx); +float (*HalfvecL1Distance) (int dim, half * ax, half * bx); + +static float +HalfvecL2SquaredDistanceDefault(int dim, half * ax, half * bx) +{ + float distance = 0.0; + + /* Auto-vectorized */ + for (int i = 0; i < dim; i++) + { + float diff = HalfToFloat4(ax[i]) - HalfToFloat4(bx[i]); + + distance += diff * diff; + } + + return distance; +} + +#ifdef HALFVEC_DISPATCH +TARGET_F16C static float +HalfvecL2SquaredDistanceF16c(int dim, half * ax, half * bx) +{ + float distance; + int i; + float s[8]; + int count = (dim / 8) * 8; + __m256 dist = _mm256_setzero_ps(); + + for (i = 0; i < count; i += 8) + { + __m128i axi = _mm_loadu_si128((__m128i *) (ax + i)); + __m128i bxi = _mm_loadu_si128((__m128i *) (bx + i)); + __m256 axs = _mm256_cvtph_ps(axi); + __m256 bxs = _mm256_cvtph_ps(bxi); + __m256 diff = _mm256_sub_ps(axs, bxs); + + dist = _mm256_fmadd_ps(diff, diff, dist); + } + + _mm256_storeu_ps(s, dist); + + distance = s[0] + s[1] + s[2] + s[3] + s[4] + s[5] + s[6] + s[7]; + + for (; i < dim; i++) + { + float diff = HalfToFloat4(ax[i]) - HalfToFloat4(bx[i]); + + distance += diff * diff; + } + + return distance; +} +#endif + +static float +HalfvecInnerProductDefault(int dim, half * ax, half * bx) +{ + float distance = 0.0; + + /* Auto-vectorized */ + for (int i = 0; i < dim; i++) + distance += HalfToFloat4(ax[i]) * HalfToFloat4(bx[i]); + + return distance; +} + +#ifdef HALFVEC_DISPATCH +TARGET_F16C static float +HalfvecInnerProductF16c(int dim, half * ax, half * bx) +{ + float distance; + int i; + float s[8]; + int count = (dim / 8) * 8; + __m256 dist = _mm256_setzero_ps(); + + for (i = 0; i < count; i += 8) + { + __m128i axi = _mm_loadu_si128((__m128i *) (ax + i)); + __m128i bxi = _mm_loadu_si128((__m128i *) (bx + i)); + __m256 axs = _mm256_cvtph_ps(axi); + __m256 bxs = _mm256_cvtph_ps(bxi); + + dist = _mm256_fmadd_ps(axs, bxs, dist); + } + + _mm256_storeu_ps(s, dist); + + distance = s[0] + s[1] + s[2] + s[3] + s[4] + s[5] + s[6] + s[7]; + + for (; i < dim; i++) + 
distance += HalfToFloat4(ax[i]) * HalfToFloat4(bx[i]); + + return distance; +} +#endif + +static double +HalfvecCosineSimilarityDefault(int dim, half * ax, half * bx) +{ + float similarity = 0.0; + float norma = 0.0; + float normb = 0.0; + + /* Auto-vectorized */ + for (int i = 0; i < dim; i++) + { + float axi = HalfToFloat4(ax[i]); + float bxi = HalfToFloat4(bx[i]); + + similarity += axi * bxi; + norma += axi * axi; + normb += bxi * bxi; + } + + /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ + return (double) similarity / sqrt((double) norma * (double) normb); +} + +#ifdef HALFVEC_DISPATCH +TARGET_F16C static double +HalfvecCosineSimilarityF16c(int dim, half * ax, half * bx) +{ + float similarity; + float norma; + float normb; + int i; + float s[8]; + int count = (dim / 8) * 8; + __m256 sim = _mm256_setzero_ps(); + __m256 na = _mm256_setzero_ps(); + __m256 nb = _mm256_setzero_ps(); + + for (i = 0; i < count; i += 8) + { + __m128i axi = _mm_loadu_si128((__m128i *) (ax + i)); + __m128i bxi = _mm_loadu_si128((__m128i *) (bx + i)); + __m256 axs = _mm256_cvtph_ps(axi); + __m256 bxs = _mm256_cvtph_ps(bxi); + + sim = _mm256_fmadd_ps(axs, bxs, sim); + na = _mm256_fmadd_ps(axs, axs, na); + nb = _mm256_fmadd_ps(bxs, bxs, nb); + } + + _mm256_storeu_ps(s, sim); + similarity = s[0] + s[1] + s[2] + s[3] + s[4] + s[5] + s[6] + s[7]; + + _mm256_storeu_ps(s, na); + norma = s[0] + s[1] + s[2] + s[3] + s[4] + s[5] + s[6] + s[7]; + + _mm256_storeu_ps(s, nb); + normb = s[0] + s[1] + s[2] + s[3] + s[4] + s[5] + s[6] + s[7]; + + /* Auto-vectorized */ + for (; i < dim; i++) + { + float axi = HalfToFloat4(ax[i]); + float bxi = HalfToFloat4(bx[i]); + + similarity += axi * bxi; + norma += axi * axi; + normb += bxi * bxi; + } + + /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ + return (double) similarity / sqrt((double) norma * (double) normb); +} +#endif + +static float +HalfvecL1DistanceDefault(int dim, half * ax, half * bx) +{ + float distance = 0.0; + + /* Auto-vectorized */ + for (int i = 
0; i < dim; i++) + distance += fabsf(HalfToFloat4(ax[i]) - HalfToFloat4(bx[i])); + + return distance; +} + +#ifdef HALFVEC_DISPATCH +/* Does not require FMA, but keep logic simple */ +TARGET_F16C static float +HalfvecL1DistanceF16c(int dim, half * ax, half * bx) +{ + float distance; + int i; + float s[8]; + int count = (dim / 8) * 8; + __m256 dist = _mm256_setzero_ps(); + __m256 sign = _mm256_set1_ps(-0.0); + + for (i = 0; i < count; i += 8) + { + __m128i axi = _mm_loadu_si128((__m128i *) (ax + i)); + __m128i bxi = _mm_loadu_si128((__m128i *) (bx + i)); + __m256 axs = _mm256_cvtph_ps(axi); + __m256 bxs = _mm256_cvtph_ps(bxi); + + dist = _mm256_add_ps(dist, _mm256_andnot_ps(sign, _mm256_sub_ps(axs, bxs))); + } + + _mm256_storeu_ps(s, dist); + + distance = s[0] + s[1] + s[2] + s[3] + s[4] + s[5] + s[6] + s[7]; + + for (; i < dim; i++) + distance += fabsf(HalfToFloat4(ax[i]) - HalfToFloat4(bx[i])); + + return distance; +} +#endif + +#ifdef HALFVEC_DISPATCH +#define CPU_FEATURE_FMA (1 << 12) +#define CPU_FEATURE_OSXSAVE (1 << 27) +#define CPU_FEATURE_AVX (1 << 28) +#define CPU_FEATURE_F16C (1 << 29) + +#ifdef _MSC_VER +#define TARGET_XSAVE +#else +#define TARGET_XSAVE __attribute__((target("xsave"))) +#endif + +TARGET_XSAVE static bool +SupportsCpuFeature(unsigned int feature) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + +#if defined(USE__GET_CPUID) + __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); +#else + __cpuid(exx, 1); +#endif + + /* Check OS supports XSAVE */ + if ((exx[2] & CPU_FEATURE_OSXSAVE) != CPU_FEATURE_OSXSAVE) + return false; + + /* Check XMM and YMM registers are enabled */ + if ((_xgetbv(0) & 6) != 6) + return false; + + /* Now check features */ + return (exx[2] & feature) == feature; +} +#endif + +void +HalfvecInit(void) +{ + /* + * Could skip pointer when single function, but no difference in + * performance + */ + HalfvecL2SquaredDistance = HalfvecL2SquaredDistanceDefault; + HalfvecInnerProduct = HalfvecInnerProductDefault; + 
HalfvecCosineSimilarity = HalfvecCosineSimilarityDefault; + HalfvecL1Distance = HalfvecL1DistanceDefault; + +#ifdef HALFVEC_DISPATCH + if (SupportsCpuFeature(CPU_FEATURE_AVX | CPU_FEATURE_F16C | CPU_FEATURE_FMA)) + { + HalfvecL2SquaredDistance = HalfvecL2SquaredDistanceF16c; + HalfvecInnerProduct = HalfvecInnerProductF16c; + HalfvecCosineSimilarity = HalfvecCosineSimilarityF16c; + /* Does not require FMA, but keep logic simple */ + HalfvecL1Distance = HalfvecL1DistanceF16c; + } +#endif +} diff --git a/src/postgres/third-party-extensions/pgvector/src/halfutils.h b/src/postgres/third-party-extensions/pgvector/src/halfutils.h new file mode 100644 index 000000000000..c684f72d7184 --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/src/halfutils.h @@ -0,0 +1,263 @@ +#ifndef HALFUTILS_H +#define HALFUTILS_H + +#include <math.h> + +#include "common/shortest_dec.h" +#include "halfvec.h" + +#ifdef F16C_SUPPORT +#include <immintrin.h> +#endif + +extern float (*HalfvecL2SquaredDistance) (int dim, half * ax, half * bx); +extern float (*HalfvecInnerProduct) (int dim, half * ax, half * bx); +extern double (*HalfvecCosineSimilarity) (int dim, half * ax, half * bx); +extern float (*HalfvecL1Distance) (int dim, half * ax, half * bx); + +void HalfvecInit(void); + +/* + * Check if half is NaN + */ +static inline bool +HalfIsNan(half num) +{ +#ifdef FLT16_SUPPORT + return isnan(num); +#else + return (num & 0x7C00) == 0x7C00 && (num & 0x7FFF) != 0x7C00; +#endif +} + +/* + * Check if half is infinite + */ +static inline bool +HalfIsInf(half num) +{ +#ifdef FLT16_SUPPORT + return isinf(num); +#else + return (num & 0x7FFF) == 0x7C00; +#endif +} + +/* + * Check if half is zero + */ +static inline bool +HalfIsZero(half num) +{ +#ifdef FLT16_SUPPORT + return num == 0; +#else + return (num & 0x7FFF) == 0x0000; +#endif +} + +/* + * Convert a half to a float4 + */ +static inline float +HalfToFloat4(half num) +{ +#if defined(F16C_SUPPORT) + return _cvtsh_ss(num); +#elif defined(FLT16_SUPPORT) + return
(float) num; +#else + union + { + float f; + uint32 i; + } swapfloat; + + union + { + half h; + uint16 i; + } swaphalf; + + uint16 bin; + uint32 exponent; + uint32 mantissa; + uint32 result; + + swaphalf.h = num; + bin = swaphalf.i; + exponent = (bin & 0x7C00) >> 10; + mantissa = bin & 0x03FF; + + /* Sign */ + result = (bin & 0x8000) << 16; + + if (unlikely(exponent == 31)) + { + if (mantissa == 0) + { + /* Infinite */ + result |= 0x7F800000; + } + else + { + /* NaN */ + result |= 0x7FC00000; + } + } + else if (unlikely(exponent == 0)) + { + /* Subnormal */ + if (mantissa != 0) + { + exponent = -14; + + for (int i = 0; i < 10; i++) + { + mantissa <<= 1; + exponent -= 1; + + if ((mantissa >> 10) % 2 == 1) + { + mantissa &= 0x03ff; + break; + } + } + + result |= (exponent + 127) << 23; + } + } + else + { + /* Normal */ + result |= (exponent - 15 + 127) << 23; + } + + result |= mantissa << 13; + + swapfloat.i = result; + return swapfloat.f; +#endif +} + +/* + * Convert a float4 to a half + */ +static inline half +Float4ToHalfUnchecked(float num) +{ +#if defined(F16C_SUPPORT) + return _cvtss_sh(num, 0); +#elif defined(FLT16_SUPPORT) + return (_Float16) num; +#else + union + { + float f; + uint32 i; + } swapfloat; + + union + { + half h; + uint16 i; + } swaphalf; + + uint32 bin; + int exponent; + int mantissa; + uint16 result; + + swapfloat.f = num; + bin = swapfloat.i; + exponent = (bin & 0x7F800000) >> 23; + mantissa = bin & 0x007FFFFF; + + /* Sign */ + result = (bin & 0x80000000) >> 16; + + if (isinf(num)) + { + /* Infinite */ + result |= 0x7C00; + } + else if (isnan(num)) + { + /* NaN */ + result |= 0x7E00; + result |= mantissa >> 13; + } + else if (exponent > 98) + { + int m; + int gr; + int s; + + exponent -= 127; + s = mantissa & 0x00000FFF; + + /* Subnormal */ + if (exponent < -14) + { + int diff = -exponent - 14; + + mantissa >>= diff; + mantissa += 1 << (23 - diff); + s |= mantissa & 0x00000FFF; + } + + m = mantissa >> 13; + + /* Round */ + gr = (mantissa >> 
12) % 4; + if (gr == 3 || (gr == 1 && s != 0)) + m += 1; + + if (m == 1024) + { + m = 0; + exponent += 1; + } + + if (exponent > 15) + { + /* Infinite */ + result |= 0x7C00; + } + else + { + if (exponent >= -14) + result |= (exponent + 15) << 10; + + result |= m; + } + } + + swaphalf.i = result; + return swaphalf.h; +#endif +} + +/* + * Convert a float4 to a half + */ +static inline half +Float4ToHalf(float num) +{ + half result = Float4ToHalfUnchecked(num); + + if (unlikely(HalfIsInf(result)) && !isinf(num)) + { + char *buf = palloc(FLOAT_SHORTEST_DECIMAL_LEN); + + float_to_shortest_decimal_buf(num, buf); + + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("\"%s\" is out of range for type halfvec", buf))); + } + + return result; +} + +#endif diff --git a/src/postgres/third-party-extensions/pgvector/src/halfvec.c b/src/postgres/third-party-extensions/pgvector/src/halfvec.c new file mode 100644 index 000000000000..aad320b1c2ca --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/src/halfvec.c @@ -0,0 +1,1189 @@ +#include "postgres.h" + +#include <math.h> + +#include "bitvec.h" +#include "catalog/pg_type.h" +#include "common/shortest_dec.h" +#include "fmgr.h" +#include "halfutils.h" +#include "halfvec.h" +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "port.h" /* for strtof() */ +#include "sparsevec.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/float.h" +#include "utils/lsyscache.h" +#include "utils/numeric.h" +#include "vector.h" + +#define STATE_DIMS(x) (ARR_DIMS(x)[0] - 1) +#define CreateStateDatums(dim) palloc(sizeof(Datum) * (dim + 1)) + +/* + * Get a half from a message buffer + */ +static half +pq_getmsghalf(StringInfo msg) +{ + union + { + half h; + uint16 i; + } swap; + + swap.i = pq_getmsgint(msg, 2); + return swap.h; +} + +/* + * Append a half to a StringInfo buffer + */ +static void +pq_sendhalf(StringInfo buf, half h) +{ + union + { + half h; + uint16 i; + } swap; + + swap.h = h;
+ pq_sendint16(buf, swap.i); +} + +/* + * Ensure same dimensions + */ +static inline void +CheckDims(HalfVector * a, HalfVector * b) +{ + if (a->dim != b->dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("different halfvec dimensions %d and %d", a->dim, b->dim))); +} + +/* + * Ensure expected dimensions + */ +static inline void +CheckExpectedDim(int32 typmod, int dim) +{ + if (typmod != -1 && typmod != dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("expected %d dimensions, not %d", typmod, dim))); +} + +/* + * Ensure valid dimensions + */ +static inline void +CheckDim(int dim) +{ + if (dim < 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("halfvec must have at least 1 dimension"))); + + if (dim > HALFVEC_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("halfvec cannot have more than %d dimensions", HALFVEC_MAX_DIM))); +} + +/* + * Ensure finite element + */ +static inline void +CheckElement(half value) +{ + if (HalfIsNan(value)) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("NaN not allowed in halfvec"))); + + if (HalfIsInf(value)) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("infinite value not allowed in halfvec"))); +} + +/* + * Allocate and initialize a new half vector + */ +HalfVector * +InitHalfVector(int dim) +{ + HalfVector *result; + int size; + + size = HALFVEC_SIZE(dim); + result = (HalfVector *) palloc0(size); + SET_VARSIZE(result, size); + result->dim = dim; + + return result; +} + +/* + * Check for whitespace, since array_isspace() is static + */ +static inline bool +halfvec_isspace(char ch) +{ + if (ch == ' ' || + ch == '\t' || + ch == '\n' || + ch == '\r' || + ch == '\v' || + ch == '\f') + return true; + return false; +} + +/* + * Check state array + */ +static float8 * +CheckStateArray(ArrayType *statearray, const char *caller) +{ + if (ARR_NDIM(statearray) != 1 || + ARR_DIMS(statearray)[0] < 1 || + ARR_HASNULL(statearray) || + 
ARR_ELEMTYPE(statearray) != FLOAT8OID) + elog(ERROR, "%s: expected state array", caller); + return (float8 *) ARR_DATA_PTR(statearray); +} + +/* + * Convert textual representation to internal representation + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_in); +Datum +halfvec_in(PG_FUNCTION_ARGS) +{ + char *lit = PG_GETARG_CSTRING(0); + int32 typmod = PG_GETARG_INT32(2); + half x[HALFVEC_MAX_DIM]; + int dim = 0; + char *pt = lit; + HalfVector *result; + + while (halfvec_isspace(*pt)) + pt++; + + if (*pt != '[') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type halfvec: \"%s\"", lit), + errdetail("Vector contents must start with \"[\"."))); + + pt++; + + while (halfvec_isspace(*pt)) + pt++; + + if (*pt == ']') + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("halfvec must have at least 1 dimension"))); + + for (;;) + { + float val; + char *stringEnd; + + if (dim == HALFVEC_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("halfvec cannot have more than %d dimensions", HALFVEC_MAX_DIM))); + + while (halfvec_isspace(*pt)) + pt++; + + /* Check for empty string like float4in */ + if (*pt == '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type halfvec: \"%s\"", lit))); + + errno = 0; + + /* Postgres sets LC_NUMERIC to C on startup */ + val = strtof(pt, &stringEnd); + + if (stringEnd == pt) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type halfvec: \"%s\"", lit))); + + x[dim] = Float4ToHalfUnchecked(val); + + /* Check for range error like float4in */ + if ((errno == ERANGE && isinf(val)) || (HalfIsInf(x[dim]) && !isinf(val))) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("\"%s\" is out of range for type halfvec", pnstrdup(pt, stringEnd - pt)))); + + CheckElement(x[dim]); + dim++; + + pt = stringEnd; + + while (halfvec_isspace(*pt)) + 
pt++; + + if (*pt == ',') + pt++; + else if (*pt == ']') + { + pt++; + break; + } + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type halfvec: \"%s\"", lit))); + } + + /* Only whitespace is allowed after the closing brace */ + while (halfvec_isspace(*pt)) + pt++; + + if (*pt != '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type halfvec: \"%s\"", lit), + errdetail("Junk after closing right brace."))); + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + result = InitHalfVector(dim); + for (int i = 0; i < dim; i++) + result->x[i] = x[i]; + + PG_RETURN_POINTER(result); +} + +#define AppendChar(ptr, c) (*(ptr)++ = (c)) +#define AppendFloat(ptr, f) ((ptr) += float_to_shortest_decimal_bufn((f), (ptr))) + +/* + * Convert internal representation to textual representation + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_out); +Datum +halfvec_out(PG_FUNCTION_ARGS) +{ + HalfVector *vector = PG_GETARG_HALFVEC_P(0); + int dim = vector->dim; + char *buf; + char *ptr; + + /* + * Need: + * + * dim * (FLOAT_SHORTEST_DECIMAL_LEN - 1) bytes for + * float_to_shortest_decimal_bufn + * + * dim - 1 bytes for separator + * + * 3 bytes for [, ], and \0 + */ + buf = (char *) palloc(FLOAT_SHORTEST_DECIMAL_LEN * dim + 2); + ptr = buf; + + AppendChar(ptr, '['); + + for (int i = 0; i < dim; i++) + { + if (i > 0) + AppendChar(ptr, ','); + + /* + * Use shortest decimal representation of single-precision float for + * simplicity + */ + AppendFloat(ptr, HalfToFloat4(vector->x[i])); + } + + AppendChar(ptr, ']'); + *ptr = '\0'; + + PG_FREE_IF_COPY(vector, 0); + PG_RETURN_CSTRING(buf); +} + +/* + * Convert type modifier + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_typmod_in); +Datum +halfvec_typmod_in(PG_FUNCTION_ARGS) +{ + ArrayType *ta = PG_GETARG_ARRAYTYPE_P(0); + int32 *tl; + int n; + + tl = ArrayGetIntegerTypmods(ta, &n); + + if (n != 1) + ereport(ERROR, + 
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid type modifier"))); + + if (*tl < 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("dimensions for type halfvec must be at least 1"))); + + if (*tl > HALFVEC_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("dimensions for type halfvec cannot exceed %d", HALFVEC_MAX_DIM))); + + PG_RETURN_INT32(*tl); +} + +/* + * Convert external binary representation to internal representation + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_recv); +Datum +halfvec_recv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + int32 typmod = PG_GETARG_INT32(2); + HalfVector *result; + int16 dim; + int16 unused; + + dim = pq_getmsgint(buf, sizeof(int16)); + unused = pq_getmsgint(buf, sizeof(int16)); + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + if (unused != 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("expected unused to be 0, not %d", unused))); + + result = InitHalfVector(dim); + for (int i = 0; i < dim; i++) + { + result->x[i] = pq_getmsghalf(buf); + CheckElement(result->x[i]); + } + + PG_RETURN_POINTER(result); +} + +/* + * Convert internal representation to the external binary representation + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_send); +Datum +halfvec_send(PG_FUNCTION_ARGS) +{ + HalfVector *vec = PG_GETARG_HALFVEC_P(0); + StringInfoData buf; + + pq_begintypsend(&buf); + pq_sendint(&buf, vec->dim, sizeof(int16)); + pq_sendint(&buf, vec->unused, sizeof(int16)); + for (int i = 0; i < vec->dim; i++) + pq_sendhalf(&buf, vec->x[i]); + + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); +} + +/* + * Convert half vector to half vector + * This is needed to check the type modifier + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec); +Datum +halfvec(PG_FUNCTION_ARGS) +{ + HalfVector *vec = PG_GETARG_HALFVEC_P(0); + int32 typmod = PG_GETARG_INT32(1); + + CheckExpectedDim(typmod, vec->dim); + + PG_RETURN_POINTER(vec); +} + 
+/* + * Convert array to half vector + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(array_to_halfvec); +Datum +array_to_halfvec(PG_FUNCTION_ARGS) +{ + ArrayType *array = PG_GETARG_ARRAYTYPE_P(0); + int32 typmod = PG_GETARG_INT32(1); + HalfVector *result; + int16 typlen; + bool typbyval; + char typalign; + Datum *elemsp; + int nelemsp; + + if (ARR_NDIM(array) > 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("array must be 1-D"))); + + if (ARR_HASNULL(array) && array_contains_nulls(array)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("array must not contain nulls"))); + + get_typlenbyvalalign(ARR_ELEMTYPE(array), &typlen, &typbyval, &typalign); + deconstruct_array(array, ARR_ELEMTYPE(array), typlen, typbyval, typalign, &elemsp, NULL, &nelemsp); + + CheckDim(nelemsp); + CheckExpectedDim(typmod, nelemsp); + + result = InitHalfVector(nelemsp); + + if (ARR_ELEMTYPE(array) == INT4OID) + { + for (int i = 0; i < nelemsp; i++) + result->x[i] = Float4ToHalf(DatumGetInt32(elemsp[i])); + } + else if (ARR_ELEMTYPE(array) == FLOAT8OID) + { + for (int i = 0; i < nelemsp; i++) + result->x[i] = Float4ToHalf(DatumGetFloat8(elemsp[i])); + } + else if (ARR_ELEMTYPE(array) == FLOAT4OID) + { + for (int i = 0; i < nelemsp; i++) + result->x[i] = Float4ToHalf(DatumGetFloat4(elemsp[i])); + } + else if (ARR_ELEMTYPE(array) == NUMERICOID) + { + for (int i = 0; i < nelemsp; i++) + result->x[i] = Float4ToHalf(DatumGetFloat4(DirectFunctionCall1(numeric_float4, elemsp[i]))); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("unsupported array type"))); + } + + /* + * Free allocation from deconstruct_array. Do not free individual elements + * when pass-by-reference since they point to original array. 
+ */ + pfree(elemsp); + + /* Check elements */ + for (int i = 0; i < result->dim; i++) + CheckElement(result->x[i]); + + PG_RETURN_POINTER(result); +} + +/* + * Convert half vector to float4[] + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_to_float4); +Datum +halfvec_to_float4(PG_FUNCTION_ARGS) +{ + HalfVector *vec = PG_GETARG_HALFVEC_P(0); + Datum *datums; + ArrayType *result; + + datums = (Datum *) palloc(sizeof(Datum) * vec->dim); + + for (int i = 0; i < vec->dim; i++) + datums[i] = Float4GetDatum(HalfToFloat4(vec->x[i])); + + /* Use TYPALIGN_INT for float4 */ + result = construct_array(datums, vec->dim, FLOAT4OID, sizeof(float4), true, TYPALIGN_INT); + + pfree(datums); + + PG_RETURN_POINTER(result); +} + +/* + * Convert vector to half vec + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_to_halfvec); +Datum +vector_to_halfvec(PG_FUNCTION_ARGS) +{ + Vector *vec = PG_GETARG_VECTOR_P(0); + int32 typmod = PG_GETARG_INT32(1); + HalfVector *result; + + CheckDim(vec->dim); + CheckExpectedDim(typmod, vec->dim); + + result = InitHalfVector(vec->dim); + + for (int i = 0; i < vec->dim; i++) + result->x[i] = Float4ToHalf(vec->x[i]); + + PG_RETURN_POINTER(result); +} + +/* + * Get the L2 distance between half vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_l2_distance); +Datum +halfvec_l2_distance(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8(sqrt((double) HalfvecL2SquaredDistance(a->dim, a->x, b->x))); +} + +/* + * Get the L2 squared distance between half vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_l2_squared_distance); +Datum +halfvec_l2_squared_distance(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8((double) HalfvecL2SquaredDistance(a->dim, a->x, b->x)); +} + +/* + * Get the inner product of two half vectors + */ +FUNCTION_PREFIX 
PG_FUNCTION_INFO_V1(halfvec_inner_product); +Datum +halfvec_inner_product(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8((double) HalfvecInnerProduct(a->dim, a->x, b->x)); +} + +/* + * Get the negative inner product of two half vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_negative_inner_product); +Datum +halfvec_negative_inner_product(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8((double) -HalfvecInnerProduct(a->dim, a->x, b->x)); +} + +/* + * Get the cosine distance between two half vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_cosine_distance); +Datum +halfvec_cosine_distance(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + double similarity; + + CheckDims(a, b); + + similarity = HalfvecCosineSimilarity(a->dim, a->x, b->x); + +#ifdef _MSC_VER + /* /fp:fast may not propagate NaN */ + if (isnan(similarity)) + PG_RETURN_FLOAT8(NAN); +#endif + + /* Keep in range */ + if (similarity > 1) + similarity = 1; + else if (similarity < -1) + similarity = -1; + + PG_RETURN_FLOAT8(1 - similarity); +} + +/* + * Get the distance for spherical k-means + * Currently uses angular distance since needs to satisfy triangle inequality + * Assumes inputs are unit vectors (skips norm) + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_spherical_distance); +Datum +halfvec_spherical_distance(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + double distance; + + CheckDims(a, b); + + distance = (double) HalfvecInnerProduct(a->dim, a->x, b->x); + + /* Prevent NaN with acos with loss of precision */ + if (distance > 1) + distance = 1; + else if (distance < -1) + distance = -1; + + PG_RETURN_FLOAT8(acos(distance) / M_PI); +} + +/* + * Get the L1 distance 
between two half vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_l1_distance); +Datum +halfvec_l1_distance(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8((double) HalfvecL1Distance(a->dim, a->x, b->x)); +} + +/* + * Get the dimensions of a half vector + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_vector_dims); +Datum +halfvec_vector_dims(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + + PG_RETURN_INT32(a->dim); +} + +/* + * Get the L2 norm of a half vector + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_l2_norm); +Datum +halfvec_l2_norm(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + half *ax = a->x; + double norm = 0.0; + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + { + double axi = (double) HalfToFloat4(ax[i]); + + norm += axi * axi; + } + + PG_RETURN_FLOAT8(sqrt(norm)); +} + +/* + * Normalize a half vector with the L2 norm + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_l2_normalize); +Datum +halfvec_l2_normalize(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + half *ax = a->x; + double norm = 0; + HalfVector *result; + half *rx; + + result = InitHalfVector(a->dim); + rx = result->x; + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + norm += (double) HalfToFloat4(ax[i]) * (double) HalfToFloat4(ax[i]); + + norm = sqrt(norm); + + /* Return zero vector for zero norm */ + if (norm > 0) + { + for (int i = 0; i < a->dim; i++) + rx[i] = Float4ToHalfUnchecked(HalfToFloat4(ax[i]) / norm); + + /* Check for overflow */ + for (int i = 0; i < a->dim; i++) + { + if (HalfIsInf(rx[i])) + float_overflow_error(); + } + } + + PG_RETURN_POINTER(result); +} + +/* + * Add half vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_add); +Datum +halfvec_add(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + half *ax = a->x; + half *bx 
= b->x; + HalfVector *result; + half *rx; + + CheckDims(a, b); + + result = InitHalfVector(a->dim); + rx = result->x; + + /* Auto-vectorized */ + for (int i = 0, imax = a->dim; i < imax; i++) + { +#ifdef FLT16_SUPPORT + rx[i] = ax[i] + bx[i]; +#else + rx[i] = Float4ToHalfUnchecked(HalfToFloat4(ax[i]) + HalfToFloat4(bx[i])); +#endif + } + + /* Check for overflow */ + for (int i = 0, imax = a->dim; i < imax; i++) + { + if (HalfIsInf(rx[i])) + float_overflow_error(); + } + + PG_RETURN_POINTER(result); +} + +/* + * Subtract half vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_sub); +Datum +halfvec_sub(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + half *ax = a->x; + half *bx = b->x; + HalfVector *result; + half *rx; + + CheckDims(a, b); + + result = InitHalfVector(a->dim); + rx = result->x; + + /* Auto-vectorized */ + for (int i = 0, imax = a->dim; i < imax; i++) + { +#ifdef FLT16_SUPPORT + rx[i] = ax[i] - bx[i]; +#else + rx[i] = Float4ToHalfUnchecked(HalfToFloat4(ax[i]) - HalfToFloat4(bx[i])); +#endif + } + + /* Check for overflow */ + for (int i = 0, imax = a->dim; i < imax; i++) + { + if (HalfIsInf(rx[i])) + float_overflow_error(); + } + + PG_RETURN_POINTER(result); +} + +/* + * Multiply half vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_mul); +Datum +halfvec_mul(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + half *ax = a->x; + half *bx = b->x; + HalfVector *result; + half *rx; + + CheckDims(a, b); + + result = InitHalfVector(a->dim); + rx = result->x; + + /* Auto-vectorized */ + for (int i = 0, imax = a->dim; i < imax; i++) + { +#ifdef FLT16_SUPPORT + rx[i] = ax[i] * bx[i]; +#else + rx[i] = Float4ToHalfUnchecked(HalfToFloat4(ax[i]) * HalfToFloat4(bx[i])); +#endif + } + + /* Check for overflow and underflow */ + for (int i = 0, imax = a->dim; i < imax; i++) + { + if (HalfIsInf(rx[i])) + float_overflow_error(); + + if 
(HalfIsZero(rx[i]) && !(HalfIsZero(ax[i]) || HalfIsZero(bx[i]))) + float_underflow_error(); + } + + PG_RETURN_POINTER(result); +} + +/* + * Concatenate half vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_concat); +Datum +halfvec_concat(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + HalfVector *result; + int dim = a->dim + b->dim; + + CheckDim(dim); + result = InitHalfVector(dim); + + for (int i = 0; i < a->dim; i++) + result->x[i] = a->x[i]; + + for (int i = 0; i < b->dim; i++) + result->x[i + a->dim] = b->x[i]; + + PG_RETURN_POINTER(result); +} + +/* + * Quantize a half vector + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_binary_quantize); +Datum +halfvec_binary_quantize(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + half *ax = a->x; + VarBit *result = InitBitVector(a->dim); + unsigned char *rx = VARBITS(result); + + for (int i = 0; i < a->dim; i++) + rx[i / 8] |= (HalfToFloat4(ax[i]) > 0) << (7 - (i % 8)); + + PG_RETURN_VARBIT_P(result); +} + +/* + * Get a subvector + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_subvector); +Datum +halfvec_subvector(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + int32 start = PG_GETARG_INT32(1); + int32 count = PG_GETARG_INT32(2); + int32 end; + half *ax = a->x; + HalfVector *result; + int32 dim; + + if (count < 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("halfvec must have at least 1 dimension"))); + + /* + * Check if (start + count > a->dim), avoiding integer overflow. a->dim + * and count are both positive, so a->dim - count won't overflow. 
+ */ + if (start > a->dim - count) + end = a->dim + 1; + else + end = start + count; + + /* Indexing starts at 1, like substring */ + if (start < 1) + start = 1; + else if (start > a->dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("halfvec must have at least 1 dimension"))); + + dim = end - start; + CheckDim(dim); + result = InitHalfVector(dim); + + for (int i = 0; i < dim; i++) + result->x[i] = ax[start - 1 + i]; + + PG_RETURN_POINTER(result); +} + +/* + * Internal helper to compare half vectors + */ +static int +halfvec_cmp_internal(HalfVector * a, HalfVector * b) +{ + int dim = Min(a->dim, b->dim); + + /* Check values before dimensions to be consistent with Postgres arrays */ + for (int i = 0; i < dim; i++) + { + if (HalfToFloat4(a->x[i]) < HalfToFloat4(b->x[i])) + return -1; + + if (HalfToFloat4(a->x[i]) > HalfToFloat4(b->x[i])) + return 1; + } + + if (a->dim < b->dim) + return -1; + + if (a->dim > b->dim) + return 1; + + return 0; +} + +/* + * Less than + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_lt); +Datum +halfvec_lt(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + + PG_RETURN_BOOL(halfvec_cmp_internal(a, b) < 0); +} + +/* + * Less than or equal + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_le); +Datum +halfvec_le(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + + PG_RETURN_BOOL(halfvec_cmp_internal(a, b) <= 0); +} + +/* + * Equal + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_eq); +Datum +halfvec_eq(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + + PG_RETURN_BOOL(halfvec_cmp_internal(a, b) == 0); +} + +/* + * Not equal + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_ne); +Datum +halfvec_ne(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + + PG_RETURN_BOOL(halfvec_cmp_internal(a, b) != 0); +} + 
+/* + * Greater than or equal + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_ge); +Datum +halfvec_ge(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + + PG_RETURN_BOOL(halfvec_cmp_internal(a, b) >= 0); +} + +/* + * Greater than + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_gt); +Datum +halfvec_gt(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + + PG_RETURN_BOOL(halfvec_cmp_internal(a, b) > 0); +} + +/* + * Compare half vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_cmp); +Datum +halfvec_cmp(PG_FUNCTION_ARGS) +{ + HalfVector *a = PG_GETARG_HALFVEC_P(0); + HalfVector *b = PG_GETARG_HALFVEC_P(1); + + PG_RETURN_INT32(halfvec_cmp_internal(a, b)); +} + +/* + * Accumulate half vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_accum); +Datum +halfvec_accum(PG_FUNCTION_ARGS) +{ + ArrayType *statearray = PG_GETARG_ARRAYTYPE_P(0); + HalfVector *newval = PG_GETARG_HALFVEC_P(1); + float8 *statevalues; + int16 dim; + bool newarr; + float8 n; + Datum *statedatums; + half *x = newval->x; + ArrayType *result; + + /* Check array before using */ + statevalues = CheckStateArray(statearray, "halfvec_accum"); + dim = STATE_DIMS(statearray); + newarr = dim == 0; + + if (newarr) + dim = newval->dim; + else + CheckExpectedDim(dim, newval->dim); + + n = statevalues[0] + 1.0; + + statedatums = CreateStateDatums(dim); + statedatums[0] = Float8GetDatum(n); + + if (newarr) + { + for (int i = 0; i < dim; i++) + statedatums[i + 1] = Float8GetDatum((double) HalfToFloat4(x[i])); + } + else + { + for (int i = 0; i < dim; i++) + { + double v = statevalues[i + 1] + (double) HalfToFloat4(x[i]); + + /* Check for overflow */ + if (isinf(v)) + float_overflow_error(); + + statedatums[i + 1] = Float8GetDatum(v); + } + } + + /* Use float8 array like float4_accum */ + result = construct_array(statedatums, dim + 1, + FLOAT8OID, + sizeof(float8), FLOAT8PASSBYVAL, 
TYPALIGN_DOUBLE); + + pfree(statedatums); + + PG_RETURN_ARRAYTYPE_P(result); +} + +/* + * Average half vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_avg); +Datum +halfvec_avg(PG_FUNCTION_ARGS) +{ + ArrayType *statearray = PG_GETARG_ARRAYTYPE_P(0); + float8 *statevalues; + float8 n; + uint16 dim; + HalfVector *result; + + /* Check array before using */ + statevalues = CheckStateArray(statearray, "halfvec_avg"); + n = statevalues[0]; + + /* SQL defines AVG of no values to be NULL */ + if (n == 0.0) + PG_RETURN_NULL(); + + /* Create half vector */ + dim = STATE_DIMS(statearray); + CheckDim(dim); + result = InitHalfVector(dim); + for (int i = 0; i < dim; i++) + { + result->x[i] = Float4ToHalf(statevalues[i + 1] / n); + CheckElement(result->x[i]); + } + + PG_RETURN_POINTER(result); +} + +/* + * Convert sparse vector to half vector + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_to_halfvec); +Datum +sparsevec_to_halfvec(PG_FUNCTION_ARGS) +{ + SparseVector *svec = PG_GETARG_SPARSEVEC_P(0); + int32 typmod = PG_GETARG_INT32(1); + HalfVector *result; + int dim = svec->dim; + float *values = SPARSEVEC_VALUES(svec); + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + result = InitHalfVector(dim); + for (int i = 0; i < svec->nnz; i++) + result->x[svec->indices[i]] = Float4ToHalf(values[i]); + + PG_RETURN_POINTER(result); +} diff --git a/src/postgres/third-party-extensions/pgvector/src/halfvec.h b/src/postgres/third-party-extensions/pgvector/src/halfvec.h new file mode 100644 index 000000000000..a29f1b0e856b --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/src/halfvec.h @@ -0,0 +1,70 @@ +#ifndef HALFVEC_H +#define HALFVEC_H + +#define __STDC_WANT_IEC_60559_TYPES_EXT__ + +#include + +/* We use two types of dispatching: intrinsics and target_clones */ +/* TODO Move to better place */ +#ifndef DISABLE_DISPATCH +/* Only enable for more recent compilers to keep build process simple */ +#if defined(__x86_64__) && defined(__GNUC__) && __GNUC__ >= 9 
+#define USE_DISPATCH +#elif defined(__x86_64__) && defined(__clang_major__) && __clang_major__ >= 7 +#define USE_DISPATCH +#elif defined(_M_AMD64) && defined(_MSC_VER) && _MSC_VER >= 1920 +#define USE_DISPATCH +#endif +#endif + +/* target_clones requires glibc */ +#if defined(USE_DISPATCH) && defined(__gnu_linux__) && defined(__has_attribute) +/* Use separate line for portability */ +#if __has_attribute(target_clones) +#define USE_TARGET_CLONES +#endif +#endif + +/* Apple clang check needed for universal binaries on Mac */ +#if defined(USE_DISPATCH) && (defined(HAVE__GET_CPUID) || defined(__apple_build_version__)) +#define USE__GET_CPUID +#endif + +#if defined(USE_DISPATCH) +#define HALFVEC_DISPATCH +#endif + +/* F16C has better performance than _Float16 (on x86-64) */ +#if defined(__F16C__) +#define F16C_SUPPORT +#elif defined(__FLT16_MAX__) && !defined(HALFVEC_DISPATCH) && !defined(__FreeBSD__) && (!defined(__i386__) || defined(__SSE2__)) +#define FLT16_SUPPORT +#endif + +#ifdef FLT16_SUPPORT +#define half _Float16 +#define HALF_MAX FLT16_MAX +#else +#define half uint16 +#define HALF_MAX 65504 +#endif + +#define HALFVEC_MAX_DIM 16000 + +#define HALFVEC_SIZE(_dim) (offsetof(HalfVector, x) + sizeof(half)*(_dim)) +#define DatumGetHalfVector(x) ((HalfVector *) PG_DETOAST_DATUM(x)) +#define PG_GETARG_HALFVEC_P(x) DatumGetHalfVector(PG_GETARG_DATUM(x)) +#define PG_RETURN_HALFVEC_P(x) PG_RETURN_POINTER(x) + +typedef struct HalfVector +{ + int32 vl_len_; /* varlena header (do not touch directly!) 
*/ + int16 dim; /* number of dimensions */ + int16 unused; /* reserved for future use, always zero */ + half x[FLEXIBLE_ARRAY_MEMBER]; +} HalfVector; + +HalfVector *InitHalfVector(int dim); + +#endif diff --git a/src/postgres/third-party-extensions/pgvector/src/ivfbuild.c b/src/postgres/third-party-extensions/pgvector/src/ivfbuild.c deleted file mode 100644 index 8badbb9e9c53..000000000000 --- a/src/postgres/third-party-extensions/pgvector/src/ivfbuild.c +++ /dev/null @@ -1,666 +0,0 @@ -#ifdef YB_IVFFLAT_INDEX_SUPPORT -#include "postgres.h" - -#include - -#include "catalog/index.h" -#include "ivfflat.h" -#include "miscadmin.h" -#include "storage/bufmgr.h" -#include "utils/memutils.h" - -#if PG_VERSION_NUM >= 140000 -#include "utils/backend_progress.h" -#elif PG_VERSION_NUM >= 120000 -#include "pgstat.h" -#endif - -#if PG_VERSION_NUM >= 120000 -#include "access/tableam.h" -#include "commands/progress.h" -#else -#define PROGRESS_CREATEIDX_SUBPHASE 0 -#define PROGRESS_CREATEIDX_TUPLES_TOTAL 0 -#define PROGRESS_CREATEIDX_TUPLES_DONE 0 -#endif - -#include "catalog/pg_operator_d.h" -#include "catalog/pg_type_d.h" - -#if PG_VERSION_NUM >= 130000 -#define CALLBACK_ITEM_POINTER ItemPointer tid -#else -#define CALLBACK_ITEM_POINTER HeapTuple hup -#endif - -#if PG_VERSION_NUM >= 120000 -#define UpdateProgress(index, val) pgstat_progress_update_param(index, val) -#else -#define UpdateProgress(index, val) ((void)val) -#endif - -/* - * Add sample - */ -static void -AddSample(Datum *values, IvfflatBuildState * buildstate) -{ - VectorArray samples = buildstate->samples; - int targsamples = samples->maxlen; - - /* Detoast once for all calls */ - Datum value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); - - /* - * Normalize with KMEANS_NORM_PROC since spherical distance function - * expects unit vectors - */ - if (buildstate->kmeansnormprocinfo != NULL) - { - if (!IvfflatNormValue(buildstate->kmeansnormprocinfo, buildstate->collation, &value, buildstate->normvec)) - return; - } - 
- if (samples->length < targsamples) - { - VectorArraySet(samples, samples->length, DatumGetVector(value)); - samples->length++; - } - else - { - if (buildstate->rowstoskip < 0) - buildstate->rowstoskip = reservoir_get_next_S(&buildstate->rstate, samples->length, targsamples); - - if (buildstate->rowstoskip <= 0) - { -#if PG_VERSION_NUM >= 150000 - int k = (int) (targsamples * sampler_random_fract(&buildstate->rstate.randstate)); -#else - int k = (int) (targsamples * sampler_random_fract(buildstate->rstate.randstate)); -#endif - - Assert(k >= 0 && k < targsamples); - VectorArraySet(samples, k, DatumGetVector(value)); - } - - buildstate->rowstoskip -= 1; - } -} - -/* - * Callback for sampling - */ -static void -SampleCallback(Relation index, CALLBACK_ITEM_POINTER, Datum *values, - bool *isnull, bool tupleIsAlive, void *state) -{ - IvfflatBuildState *buildstate = (IvfflatBuildState *) state; - MemoryContext oldCtx; - - /* Skip nulls */ - if (isnull[0]) - return; - - /* Use memory context since detoast can allocate */ - oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); - - /* Add sample */ - AddSample(values, state); - - /* Reset memory context */ - MemoryContextSwitchTo(oldCtx); - MemoryContextReset(buildstate->tmpCtx); -} - -/* - * Sample rows with same logic as ANALYZE - */ -static void -SampleRows(IvfflatBuildState * buildstate) -{ - int targsamples = buildstate->samples->maxlen; - BlockNumber totalblocks = RelationGetNumberOfBlocks(buildstate->heap); - - buildstate->rowstoskip = -1; - - BlockSampler_Init(&buildstate->bs, totalblocks, targsamples, RandomInt()); - - reservoir_init_selection_state(&buildstate->rstate, targsamples); - while (BlockSampler_HasMore(&buildstate->bs)) - { - BlockNumber targblock = BlockSampler_Next(&buildstate->bs); - -#if PG_VERSION_NUM >= 120000 - table_index_build_range_scan(buildstate->heap, buildstate->index, buildstate->indexInfo, - false, true, false, targblock, 1, SampleCallback, (void *) buildstate, NULL); -#else - 
IndexBuildHeapRangeScan(buildstate->heap, buildstate->index, buildstate->indexInfo, - false, true, targblock, 1, SampleCallback, (void *) buildstate, NULL); -#endif - } -} - -/* - * Add tuple to sort - */ -static void -AddTupleToSort(Relation index, ItemPointer tid, Datum *values, IvfflatBuildState * buildstate) -{ - double distance; - double minDistance = DBL_MAX; - int closestCenter = 0; - VectorArray centers = buildstate->centers; - TupleTableSlot *slot = buildstate->slot; - int i; - - /* Detoast once for all calls */ - Datum value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); - - /* Normalize if needed */ - if (buildstate->normprocinfo != NULL) - { - if (!IvfflatNormValue(buildstate->normprocinfo, buildstate->collation, &value, buildstate->normvec)) - return; - } - - /* Find the list that minimizes the distance */ - for (i = 0; i < centers->length; i++) - { - distance = DatumGetFloat8(FunctionCall2Coll(buildstate->procinfo, buildstate->collation, value, PointerGetDatum(VectorArrayGet(centers, i)))); - - if (distance < minDistance) - { - minDistance = distance; - closestCenter = i; - } - } - -#ifdef IVFFLAT_KMEANS_DEBUG - buildstate->inertia += minDistance; - buildstate->listSums[closestCenter] += minDistance; - buildstate->listCounts[closestCenter]++; -#endif - - /* Create a virtual tuple */ - ExecClearTuple(slot); - slot->tts_values[0] = Int32GetDatum(closestCenter); - slot->tts_isnull[0] = false; - slot->tts_values[1] = PointerGetDatum(tid); - slot->tts_isnull[1] = false; - slot->tts_values[2] = value; - slot->tts_isnull[2] = false; - ExecStoreVirtualTuple(slot); - - /* - * Add tuple to sort - * - * tuplesort_puttupleslot comment: Input data is always copied; the caller - * need not save it. 
- */ - tuplesort_puttupleslot(buildstate->sortstate, slot); - - buildstate->indtuples++; -} - -/* - * Callback for table_index_build_scan - */ -static void -BuildCallback(Relation index, CALLBACK_ITEM_POINTER, Datum *values, - bool *isnull, bool tupleIsAlive, void *state) -{ - IvfflatBuildState *buildstate = (IvfflatBuildState *) state; - MemoryContext oldCtx; - -#if PG_VERSION_NUM < 130000 - ItemPointer tid = &hup->t_self; -#endif - - /* Skip nulls */ - if (isnull[0]) - return; - - /* Use memory context since detoast can allocate */ - oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); - - /* Add tuple to sort */ - AddTupleToSort(index, tid, values, buildstate); - - /* Reset memory context */ - MemoryContextSwitchTo(oldCtx); - MemoryContextReset(buildstate->tmpCtx); -} - -/* - * Get index tuple from sort state - */ -static inline void -GetNextTuple(Tuplesortstate *sortstate, TupleDesc tupdesc, TupleTableSlot *slot, IndexTuple *itup, int *list) -{ - Datum value; - bool isnull; - - if (tuplesort_gettupleslot(sortstate, true, false, slot, NULL)) - { - *list = DatumGetInt32(slot_getattr(slot, 1, &isnull)); - value = slot_getattr(slot, 3, &isnull); - - /* Form the index tuple */ - *itup = index_form_tuple(tupdesc, &value, &isnull); - (*itup)->t_tid = *((ItemPointer) DatumGetPointer(slot_getattr(slot, 2, &isnull))); - } - else - *list = -1; -} - -/* - * Create initial entry pages - */ -static void -InsertTuples(Relation index, IvfflatBuildState * buildstate, ForkNumber forkNum) -{ - Buffer buf; - Page page; - GenericXLogState *state; - int list; - IndexTuple itup = NULL; /* silence compiler warning */ - BlockNumber startPage; - BlockNumber insertPage; - Size itemsz; - int i; - int64 inserted = 0; - -#if PG_VERSION_NUM >= 120000 - TupleTableSlot *slot = MakeSingleTupleTableSlot(buildstate->tupdesc, &TTSOpsMinimalTuple); -#else - TupleTableSlot *slot = MakeSingleTupleTableSlot(buildstate->tupdesc); -#endif - TupleDesc tupdesc = RelationGetDescr(index); - - 
UpdateProgress(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_IVFFLAT_PHASE_LOAD); - - UpdateProgress(PROGRESS_CREATEIDX_TUPLES_TOTAL, buildstate->indtuples); - - GetNextTuple(buildstate->sortstate, tupdesc, slot, &itup, &list); - - for (i = 0; i < buildstate->centers->length; i++) - { - /* Can take a while, so ensure we can interrupt */ - /* Needs to be called when no buffer locks are held */ - CHECK_FOR_INTERRUPTS(); - - buf = IvfflatNewBuffer(index, forkNum); - IvfflatInitRegisterPage(index, &buf, &page, &state); - - startPage = BufferGetBlockNumber(buf); - - /* Get all tuples for list */ - while (list == i) - { - /* Check for free space */ - itemsz = MAXALIGN(IndexTupleSize(itup)); - if (PageGetFreeSpace(page) < itemsz) - IvfflatAppendPage(index, &buf, &page, &state, forkNum); - - /* Add the item */ - if (PageAddItem(page, (Item) itup, itemsz, InvalidOffsetNumber, false, false) == InvalidOffsetNumber) - elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(index)); - - pfree(itup); - - UpdateProgress(PROGRESS_CREATEIDX_TUPLES_DONE, ++inserted); - - GetNextTuple(buildstate->sortstate, tupdesc, slot, &itup, &list); - } - - insertPage = BufferGetBlockNumber(buf); - - IvfflatCommitBuffer(buf, state); - - /* Set the start and insert pages */ - IvfflatUpdateList(index, state, buildstate->listInfo[i], insertPage, InvalidBlockNumber, startPage, forkNum); - } -} - -/* - * Initialize the build state - */ -static void -InitBuildState(IvfflatBuildState * buildstate, Relation heap, Relation index, IndexInfo *indexInfo) -{ - buildstate->heap = heap; - buildstate->index = index; - buildstate->indexInfo = indexInfo; - - buildstate->lists = IvfflatGetLists(index); - buildstate->dimensions = TupleDescAttr(index->rd_att, 0)->atttypmod; - - /* Require column to have dimensions to be indexed */ - if (buildstate->dimensions < 0) - elog(ERROR, "column does not have dimensions"); - - if (buildstate->dimensions > IVFFLAT_MAX_DIM) - elog(ERROR, "column cannot have more than 
%d dimensions for ivfflat index", IVFFLAT_MAX_DIM); - - buildstate->reltuples = 0; - buildstate->indtuples = 0; - - /* Get support functions */ - buildstate->procinfo = index_getprocinfo(index, 1, IVFFLAT_DISTANCE_PROC); - buildstate->normprocinfo = IvfflatOptionalProcInfo(index, IVFFLAT_NORM_PROC); - buildstate->kmeansnormprocinfo = IvfflatOptionalProcInfo(index, IVFFLAT_KMEANS_NORM_PROC); - buildstate->collation = index->rd_indcollation[0]; - - /* Require more than one dimension for spherical k-means */ - /* Lists check for backwards compatibility */ - /* TODO Remove lists check in 0.3.0 */ - if (buildstate->kmeansnormprocinfo != NULL && buildstate->dimensions == 1 && buildstate->lists > 1) - elog(ERROR, "dimensions must be greater than one for this opclass"); - - /* Create tuple description for sorting */ -#if PG_VERSION_NUM >= 120000 - buildstate->tupdesc = CreateTemplateTupleDesc(3); -#else - buildstate->tupdesc = CreateTemplateTupleDesc(3, false); -#endif - TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 1, "list", INT4OID, -1, 0); - TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 2, "tid", TIDOID, -1, 0); - TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 3, "vector", RelationGetDescr(index)->attrs[0].atttypid, -1, 0); - -#if PG_VERSION_NUM >= 120000 - buildstate->slot = MakeSingleTupleTableSlot(buildstate->tupdesc, &TTSOpsVirtual); -#else - buildstate->slot = MakeSingleTupleTableSlot(buildstate->tupdesc); -#endif - - buildstate->centers = VectorArrayInit(buildstate->lists, buildstate->dimensions); - buildstate->listInfo = palloc(sizeof(ListInfo) * buildstate->lists); - - /* Reuse for each tuple */ - buildstate->normvec = InitVector(buildstate->dimensions); - - buildstate->tmpCtx = AllocSetContextCreate(GetCurrentMemoryContext(), - "Ivfflat build temporary context", - ALLOCSET_DEFAULT_SIZES); - -#ifdef IVFFLAT_KMEANS_DEBUG - buildstate->inertia = 0; - buildstate->listSums = palloc0(sizeof(double) * buildstate->lists); - buildstate->listCounts = 
palloc0(sizeof(int) * buildstate->lists); -#endif -} - -/* - * Free resources - */ -static void -FreeBuildState(IvfflatBuildState * buildstate) -{ - VectorArrayFree(buildstate->centers); - pfree(buildstate->listInfo); - pfree(buildstate->normvec); - -#ifdef IVFFLAT_KMEANS_DEBUG - pfree(buildstate->listSums); - pfree(buildstate->listCounts); -#endif - - MemoryContextDelete(buildstate->tmpCtx); -} - -/* - * Compute centers - */ -static void -ComputeCenters(IvfflatBuildState * buildstate) -{ - int numSamples; - - UpdateProgress(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_IVFFLAT_PHASE_KMEANS); - - /* Target 50 samples per list, with at least 10000 samples */ - /* The number of samples has a large effect on index build time */ - numSamples = buildstate->lists * 50; - if (numSamples < 10000) - numSamples = 10000; - - /* Skip samples for unlogged table */ - if (buildstate->heap == NULL) - numSamples = 1; - - /* Sample rows */ - /* TODO Ensure within maintenance_work_mem */ - buildstate->samples = VectorArrayInit(numSamples, buildstate->dimensions); - if (buildstate->heap != NULL) - { - SampleRows(buildstate); - - if (buildstate->samples->length < buildstate->lists) - { - ereport(NOTICE, - (errmsg("ivfflat index created with little data"), - errdetail("This will cause low recall."), - errhint("Drop the index until the table has more data."))); - } - } - - /* Calculate centers */ - IvfflatBench("k-means", IvfflatKmeans(buildstate->index, buildstate->samples, buildstate->centers)); - - /* Free samples before we allocate more memory */ - VectorArrayFree(buildstate->samples); -} - -/* - * Create the metapage - */ -static void -CreateMetaPage(Relation index, int dimensions, int lists, ForkNumber forkNum) -{ - Buffer buf; - Page page; - GenericXLogState *state; - IvfflatMetaPage metap; - - buf = IvfflatNewBuffer(index, forkNum); - IvfflatInitRegisterPage(index, &buf, &page, &state); - - /* Set metapage data */ - metap = IvfflatPageGetMeta(page); - metap->magicNumber = 
IVFFLAT_MAGIC_NUMBER; - metap->version = IVFFLAT_VERSION; - metap->dimensions = dimensions; - metap->lists = lists; - ((PageHeader) page)->pd_lower = - ((char *) metap + sizeof(IvfflatMetaPageData)) - (char *) page; - - IvfflatCommitBuffer(buf, state); -} - -/* - * Create list pages - */ -static void -CreateListPages(Relation index, VectorArray centers, int dimensions, - int lists, ForkNumber forkNum, ListInfo * *listInfo) -{ - int i; - Buffer buf; - Page page; - GenericXLogState *state; - OffsetNumber offno; - Size itemsz; - IvfflatList list; - - itemsz = MAXALIGN(IVFFLAT_LIST_SIZE(dimensions)); - list = palloc(itemsz); - - buf = IvfflatNewBuffer(index, forkNum); - IvfflatInitRegisterPage(index, &buf, &page, &state); - - for (i = 0; i < lists; i++) - { - /* Load list */ - list->startPage = InvalidBlockNumber; - list->insertPage = InvalidBlockNumber; - memcpy(&list->center, VectorArrayGet(centers, i), VECTOR_SIZE(dimensions)); - - /* Ensure free space */ - if (PageGetFreeSpace(page) < itemsz) - IvfflatAppendPage(index, &buf, &page, &state, forkNum); - - /* Add the item */ - offno = PageAddItem(page, (Item) list, itemsz, InvalidOffsetNumber, false, false); - if (offno == InvalidOffsetNumber) - elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(index)); - - /* Save location info */ - (*listInfo)[i].blkno = BufferGetBlockNumber(buf); - (*listInfo)[i].offno = offno; - } - - IvfflatCommitBuffer(buf, state); - - pfree(list); -} - -/* - * Print k-means metrics - */ -#ifdef IVFFLAT_KMEANS_DEBUG -static void -PrintKmeansMetrics(IvfflatBuildState * buildstate) -{ - elog(INFO, "inertia: %.3e", buildstate->inertia); - - /* Calculate Davies-Bouldin index */ - if (buildstate->lists > 1) - { - double db = 0.0; - - /* Calculate average distance */ - for (int i = 0; i < buildstate->lists; i++) - { - if (buildstate->listCounts[i] > 0) - buildstate->listSums[i] /= buildstate->listCounts[i]; - } - - for (int i = 0; i < buildstate->lists; i++) - { - double max = 
0.0; - double distance; - - for (int j = 0; j < buildstate->lists; j++) - { - if (j == i) - continue; - - distance = DatumGetFloat8(FunctionCall2Coll(buildstate->procinfo, buildstate->collation, PointerGetDatum(VectorArrayGet(buildstate->centers, i)), PointerGetDatum(VectorArrayGet(buildstate->centers, j)))); - distance = (buildstate->listSums[i] + buildstate->listSums[j]) / distance; - - if (distance > max) - max = distance; - } - db += max; - } - db /= buildstate->lists; - elog(INFO, "davies-bouldin: %.3f", db); - } -} -#endif - -/* - * Scan table for tuples to index - */ -static void -ScanTable(IvfflatBuildState * buildstate) -{ -#if PG_VERSION_NUM >= 120000 - buildstate->reltuples = table_index_build_scan(buildstate->heap, buildstate->index, buildstate->indexInfo, - true, true, BuildCallback, (void *) buildstate, NULL); -#else - buildstate->reltuples = IndexBuildHeapScan(buildstate->heap, buildstate->index, buildstate->indexInfo, - true, BuildCallback, (void *) buildstate, NULL); -#endif -} - -/* - * Create entry pages - */ -static void -CreateEntryPages(IvfflatBuildState * buildstate, ForkNumber forkNum) -{ - AttrNumber attNums[] = {1}; - Oid sortOperators[] = {Int4LessOperator}; - Oid sortCollations[] = {InvalidOid}; - bool nullsFirstFlags[] = {false}; - - UpdateProgress(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_IVFFLAT_PHASE_SORT); - - buildstate->sortstate = tuplesort_begin_heap(buildstate->tupdesc, 1, attNums, sortOperators, sortCollations, nullsFirstFlags, maintenance_work_mem, NULL, false); - - /* Add tuples to sort */ - if (buildstate->heap != NULL) - IvfflatBench("assign tuples", ScanTable(buildstate)); - - /* Sort */ - IvfflatBench("sort tuples", tuplesort_performsort(buildstate->sortstate)); - -#ifdef IVFFLAT_KMEANS_DEBUG - PrintKmeansMetrics(buildstate); -#endif - - /* Insert */ - IvfflatBench("load tuples", InsertTuples(buildstate->index, buildstate, forkNum)); - tuplesort_end(buildstate->sortstate); -} - -/* - * Build the index - */ -static void 
-BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, - IvfflatBuildState * buildstate, ForkNumber forkNum) -{ - InitBuildState(buildstate, heap, index, indexInfo); - - ComputeCenters(buildstate); - - /* Create pages */ - CreateMetaPage(index, buildstate->dimensions, buildstate->lists, forkNum); - CreateListPages(index, buildstate->centers, buildstate->dimensions, buildstate->lists, forkNum, &buildstate->listInfo); - CreateEntryPages(buildstate, forkNum); - - FreeBuildState(buildstate); -} - -/* - * Build the index for a logged table - */ -IndexBuildResult * -ivfflatbuild(Relation heap, Relation index, IndexInfo *indexInfo) -{ - IndexBuildResult *result; - IvfflatBuildState buildstate; - - BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM); - - result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); - result->heap_tuples = buildstate.reltuples; - result->index_tuples = buildstate.indtuples; - - return result; -} - -/* - * Build the index for an unlogged table - */ -void -ivfflatbuildempty(Relation index) -{ - IndexInfo *indexInfo = BuildIndexInfo(index); - IvfflatBuildState buildstate; - - BuildIndex(NULL, index, indexInfo, &buildstate, INIT_FORKNUM); -} -#endif diff --git a/src/postgres/third-party-extensions/pgvector/src/ivfflat.c b/src/postgres/third-party-extensions/pgvector/src/ivfflat.c deleted file mode 100644 index 79e22daf6e73..000000000000 --- a/src/postgres/third-party-extensions/pgvector/src/ivfflat.c +++ /dev/null @@ -1,268 +0,0 @@ -#include "postgres.h" - -#ifdef YB_IVFFLAT_INDEX_SUPPORT -#include - -#include "access/amapi.h" -#include "commands/vacuum.h" -#endif - -#include "ivfflat.h" - -#ifdef YB_IVFFLAT_INDEX_SUPPORT -#include "utils/guc.h" -#include "utils/selfuncs.h" -#include "utils/spccache.h" - -#if PG_VERSION_NUM >= 120000 -#include "commands/progress.h" -#endif - -int ivfflat_probes; -static relopt_kind ivfflat_relopt_kind; -#endif - -/* - * YB: _PG_init has been moved to vector.c. 
PgVector 0.8.0 has _PG_init - * in vector.c as well. The below code will be removed once PgVector 0.8.0 - * is merged into YB. - */ -#if 0 -/* - * Initialize index options and variables - */ -void -_PG_init(void) -{ -#ifdef YB_IVFFLAT_INDEX_SUPPORT - ivfflat_relopt_kind = add_reloption_kind(); - add_int_reloption(ivfflat_relopt_kind, "lists", "Number of inverted lists", - IVFFLAT_DEFAULT_LISTS, 1, IVFFLAT_MAX_LISTS -#if PG_VERSION_NUM >= 130000 - ,AccessExclusiveLock -#endif - ); - - DefineCustomIntVariable("ivfflat.probes", "Sets the number of probes", - "Valid range is 1..lists.", &ivfflat_probes, - 1, 1, IVFFLAT_MAX_LISTS, PGC_USERSET, 0, NULL, NULL, NULL); -#endif -} -#endif - -#ifdef YB_IVFFLAT_INDEX_SUPPORT -/* - * Get the name of index build phase - */ -#if PG_VERSION_NUM >= 120000 -static char * -ivfflatbuildphasename(int64 phasenum) -{ - switch (phasenum) - { - case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE: - return "initializing"; - case PROGRESS_IVFFLAT_PHASE_KMEANS: - return "performing k-means"; - case PROGRESS_IVFFLAT_PHASE_SORT: - return "sorting tuples"; - case PROGRESS_IVFFLAT_PHASE_LOAD: - return "loading tuples"; - default: - return NULL; - } -} -#endif - -/* - * Estimate the cost of an index scan - */ -static void -ivfflatcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, - Cost *indexStartupCost, Cost *indexTotalCost, - Selectivity *indexSelectivity, double *indexCorrelation, - double *indexPages) -{ - GenericCosts costs; - int lists; - double ratio; - double spc_seq_page_cost; - Relation indexRel; -#if PG_VERSION_NUM < 120000 - List *qinfos; -#endif - - /* Never use index without order */ - if (path->indexorderbys == NULL) - { - *indexStartupCost = DBL_MAX; - *indexTotalCost = DBL_MAX; - *indexSelectivity = 0; - *indexCorrelation = 0; - *indexPages = 0; - return; - } - - MemSet(&costs, 0, sizeof(costs)); - - indexRel = index_open(path->indexinfo->indexoid, NoLock); - lists = IvfflatGetLists(indexRel); - index_close(indexRel, 
NoLock); - - /* Get the ratio of lists that we need to visit */ - ratio = ((double) ivfflat_probes) / lists; - if (ratio > 1.0) - ratio = 1.0; - - /* - * This gives us the subset of tuples to visit. This value is passed into - * the generic cost estimator to determine the number of pages to visit - * during the index scan. - */ - costs.numIndexTuples = path->indexinfo->tuples * ratio; - -#if PG_VERSION_NUM >= 120000 - genericcostestimate(root, path, loop_count, &costs); -#else - qinfos = deconstruct_indexquals(path); - genericcostestimate(root, path, loop_count, qinfos, &costs); -#endif - - get_tablespace_page_costs(path->indexinfo->reltablespace, NULL, &spc_seq_page_cost); - - /* Adjust cost if needed since TOAST not included in seq scan cost */ - if (costs.numIndexPages > path->indexinfo->rel->pages && ratio < 0.5) - { - /* Change all page cost from random to sequential */ - costs.indexTotalCost -= costs.numIndexPages * (costs.spc_random_page_cost - spc_seq_page_cost); - - /* Remove cost of extra pages */ - costs.indexTotalCost -= (costs.numIndexPages - path->indexinfo->rel->pages) * spc_seq_page_cost; - } - else - { - /* Change some page cost from random to sequential */ - costs.indexTotalCost -= 0.5 * costs.numIndexPages * (costs.spc_random_page_cost - spc_seq_page_cost); - } - - /* - * If the list selectivity is lower than what is returned from the generic - * cost estimator, use that. 
- */ - if (ratio < costs.indexSelectivity) - costs.indexSelectivity = ratio; - - /* Use total cost since most work happens before first tuple is returned */ - *indexStartupCost = costs.indexTotalCost; - *indexTotalCost = costs.indexTotalCost; - *indexSelectivity = costs.indexSelectivity; - *indexCorrelation = costs.indexCorrelation; - *indexPages = costs.numIndexPages; -} - -/* - * Parse and validate the reloptions - */ -static bytea * -ivfflatoptions(Datum reloptions, bool validate) -{ - static const relopt_parse_elt tab[] = { - {"lists", RELOPT_TYPE_INT, offsetof(IvfflatOptions, lists)}, - }; - -#if PG_VERSION_NUM >= 130000 - return (bytea *) build_reloptions(reloptions, validate, - ivfflat_relopt_kind, - sizeof(IvfflatOptions), - tab, lengthof(tab)); -#else - relopt_value *options; - int numoptions; - IvfflatOptions *rdopts; - - options = parseRelOptions(reloptions, validate, ivfflat_relopt_kind, &numoptions); - rdopts = allocateReloptStruct(sizeof(IvfflatOptions), options, numoptions); - fillRelOptions((void *) rdopts, sizeof(IvfflatOptions), options, numoptions, - validate, tab, lengthof(tab)); - - return (bytea *) rdopts; -#endif -} - -/* - * Validate catalog entries for the specified operator class - */ -static bool -ivfflatvalidate(Oid opclassoid) -{ - return true; -} - -/* - * Define index handler - * - * See https://www.postgresql.org/docs/current/index-api.html - */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(ivfflathandler); -Datum -ivfflathandler(PG_FUNCTION_ARGS) -{ - IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); - - amroutine->amstrategies = 0; - amroutine->amsupport = 4; -#if PG_VERSION_NUM >= 130000 - amroutine->amoptsprocnum = 0; -#endif - amroutine->amcanorder = false; - amroutine->amcanorderbyop = true; - amroutine->amcanbackward = false; /* can change direction mid-scan */ - amroutine->amcanunique = false; - amroutine->amcanmulticol = false; - amroutine->amoptionalkey = true; - amroutine->amsearcharray = false; - amroutine->amsearchnulls = false; 
- amroutine->amstorage = false; - amroutine->amclusterable = false; - amroutine->ampredlocks = false; - amroutine->amcanparallel = false; - amroutine->amcaninclude = false; -#if PG_VERSION_NUM >= 130000 - amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */ - amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL; -#endif - amroutine->amkeytype = InvalidOid; - - /* Interface functions */ - amroutine->ambuild = ivfflatbuild; - amroutine->ambuildempty = ivfflatbuildempty; - amroutine->aminsert = ivfflatinsert; - amroutine->ambulkdelete = ivfflatbulkdelete; - amroutine->amvacuumcleanup = ivfflatvacuumcleanup; - amroutine->amcanreturn = NULL; /* tuple not included in heapsort */ - amroutine->amcostestimate = ivfflatcostestimate; - amroutine->amoptions = ivfflatoptions; - amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */ -#if PG_VERSION_NUM >= 120000 - amroutine->ambuildphasename = ivfflatbuildphasename; -#endif - amroutine->amvalidate = ivfflatvalidate; -#if PG_VERSION_NUM >= 140000 - amroutine->amadjustmembers = NULL; -#endif - amroutine->ambeginscan = ivfflatbeginscan; - amroutine->amrescan = ivfflatrescan; - amroutine->amgettuple = ivfflatgettuple; - amroutine->amgetbitmap = NULL; - amroutine->amendscan = ivfflatendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; - - /* Interface functions to support parallel index scans */ - amroutine->amestimateparallelscan = NULL; - amroutine->aminitparallelscan = NULL; - amroutine->amparallelrescan = NULL; - - PG_RETURN_POINTER(amroutine); -} -#endif diff --git a/src/postgres/third-party-extensions/pgvector/src/ivfflat.h b/src/postgres/third-party-extensions/pgvector/src/ivfflat.h deleted file mode 100644 index d5b81047f459..000000000000 --- a/src/postgres/third-party-extensions/pgvector/src/ivfflat.h +++ /dev/null @@ -1,254 +0,0 @@ -#ifndef IVFFLAT_H -#define IVFFLAT_H - -#include "postgres.h" - -#ifdef YB_IVFFLAT_INDEX_SUPPORT -#if PG_VERSION_NUM < 110000 
-#error "Requires PostgreSQL 11+" -#endif - -#include "access/generic_xlog.h" -#include "access/reloptions.h" -#include "nodes/execnodes.h" -#include "port.h" /* for strtof() and random() */ -#include "utils/sampling.h" -#include "utils/tuplesort.h" -#include "vector.h" - -#if PG_VERSION_NUM >= 150000 -#include "common/pg_prng.h" -#endif - -#ifdef IVFFLAT_BENCH -#include "portability/instr_time.h" -#endif - -#define IVFFLAT_MAX_DIM 2000 - -/* Support functions */ -#define IVFFLAT_DISTANCE_PROC 1 -#define IVFFLAT_NORM_PROC 2 -#define IVFFLAT_KMEANS_DISTANCE_PROC 3 -#define IVFFLAT_KMEANS_NORM_PROC 4 - -#define IVFFLAT_VERSION 1 -#define IVFFLAT_MAGIC_NUMBER 0x14FF1A7 -#define IVFFLAT_PAGE_ID 0xFF84 - -/* Preserved page numbers */ -#define IVFFLAT_METAPAGE_BLKNO 0 -#define IVFFLAT_HEAD_BLKNO 1 /* first list page */ - -#define IVFFLAT_DEFAULT_LISTS 100 -#define IVFFLAT_MAX_LISTS 32768 - -/* Build phases */ -/* PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE is 1 */ -#define PROGRESS_IVFFLAT_PHASE_KMEANS 2 -#define PROGRESS_IVFFLAT_PHASE_SORT 3 -#define PROGRESS_IVFFLAT_PHASE_LOAD 4 - -#define IVFFLAT_LIST_SIZE(_dim) (offsetof(IvfflatListData, center) + VECTOR_SIZE(_dim)) - -#define IvfflatPageGetOpaque(page) ((IvfflatPageOpaque) PageGetSpecialPointer(page)) -#define IvfflatPageGetMeta(page) ((IvfflatMetaPageData *) PageGetContents(page)) - -#ifdef IVFFLAT_BENCH -#define IvfflatBench(name, code) \ - do { \ - instr_time start; \ - instr_time duration; \ - INSTR_TIME_SET_CURRENT(start); \ - (code); \ - INSTR_TIME_SET_CURRENT(duration); \ - INSTR_TIME_SUBTRACT(duration, start); \ - elog(INFO, "%s: %.3f ms", name, INSTR_TIME_GET_MILLISEC(duration)); \ - } while (0) -#else -#define IvfflatBench(name, code) (code) -#endif - -#if PG_VERSION_NUM >= 150000 -#define RandomDouble() pg_prng_double(&pg_global_prng_state) -#define RandomInt() pg_prng_uint32(&pg_global_prng_state) -#else -#define RandomDouble() (((double) random()) / MAX_RANDOM_VALUE) -#define RandomInt() random() -#endif - 
-/* Variables */ -extern int ivfflat_probes; -#endif - -/* Exported functions */ -PGDLLEXPORT void _PG_init(void); - -#ifdef YB_IVFFLAT_INDEX_SUPPORT -typedef struct VectorArrayData -{ - int length; - int maxlen; - int dim; - Vector *items; -} VectorArrayData; - -typedef VectorArrayData * VectorArray; - -typedef struct ListInfo -{ - BlockNumber blkno; - OffsetNumber offno; -} ListInfo; - -/* IVFFlat index options */ -typedef struct IvfflatOptions -{ - int32 vl_len_; /* varlena header (do not touch directly!) */ - int lists; /* number of lists */ -} IvfflatOptions; - -typedef struct IvfflatBuildState -{ - /* Info */ - Relation heap; - Relation index; - IndexInfo *indexInfo; - - /* Settings */ - int dimensions; - int lists; - - /* Statistics */ - double indtuples; - double reltuples; - - /* Support functions */ - FmgrInfo *procinfo; - FmgrInfo *normprocinfo; - FmgrInfo *kmeansnormprocinfo; - Oid collation; - - /* Variables */ - VectorArray samples; - VectorArray centers; - ListInfo *listInfo; - Vector *normvec; - -#ifdef IVFFLAT_KMEANS_DEBUG - double inertia; - double *listSums; - int *listCounts; -#endif - - /* Sampling */ - BlockSamplerData bs; - ReservoirStateData rstate; - int rowstoskip; - - /* Sorting */ - Tuplesortstate *sortstate; - TupleDesc tupdesc; - TupleTableSlot *slot; - - /* Memory */ - MemoryContext tmpCtx; -} IvfflatBuildState; - -typedef struct IvfflatMetaPageData -{ - uint32 magicNumber; - uint32 version; - uint16 dimensions; - uint16 lists; -} IvfflatMetaPageData; - -typedef IvfflatMetaPageData * IvfflatMetaPage; - -typedef struct IvfflatPageOpaqueData -{ - BlockNumber nextblkno; - uint16 unused; - uint16 page_id; /* for identification of IVFFlat indexes */ -} IvfflatPageOpaqueData; - -typedef IvfflatPageOpaqueData * IvfflatPageOpaque; - -typedef struct IvfflatListData -{ - BlockNumber startPage; - BlockNumber insertPage; - Vector center; -} IvfflatListData; - -typedef IvfflatListData * IvfflatList; - -typedef struct IvfflatScanList -{ - 
pairingheap_node ph_node; - BlockNumber startPage; - double distance; -} IvfflatScanList; - -typedef struct IvfflatScanOpaqueData -{ - int probes; - bool first; - Buffer buf; - - /* Sorting */ - Tuplesortstate *sortstate; - TupleDesc tupdesc; - TupleTableSlot *slot; - bool isnull; - - /* Support functions */ - FmgrInfo *procinfo; - FmgrInfo *normprocinfo; - Oid collation; - - /* Lists */ - pairingheap *listQueue; - IvfflatScanList lists[FLEXIBLE_ARRAY_MEMBER]; /* must come last */ -} IvfflatScanOpaqueData; - -typedef IvfflatScanOpaqueData * IvfflatScanOpaque; - -#define VECTOR_ARRAY_SIZE(_length, _dim) (sizeof(VectorArrayData) + (_length) * VECTOR_SIZE(_dim)) -#define VECTOR_ARRAY_OFFSET(_arr, _offset) ((char*) (_arr)->items + (_offset) * VECTOR_SIZE((_arr)->dim)) -#define VectorArrayGet(_arr, _offset) ((Vector *) VECTOR_ARRAY_OFFSET(_arr, _offset)) -#define VectorArraySet(_arr, _offset, _val) memcpy(VECTOR_ARRAY_OFFSET(_arr, _offset), _val, VECTOR_SIZE((_arr)->dim)) - -/* Methods */ -VectorArray VectorArrayInit(int maxlen, int dimensions); -void VectorArrayFree(VectorArray arr); -void PrintVectorArray(char *msg, VectorArray arr); -void IvfflatKmeans(Relation index, VectorArray samples, VectorArray centers); -FmgrInfo *IvfflatOptionalProcInfo(Relation rel, uint16 procnum); -bool IvfflatNormValue(FmgrInfo *procinfo, Oid collation, Datum *value, Vector * result); -int IvfflatGetLists(Relation index); -void IvfflatUpdateList(Relation index, GenericXLogState *state, ListInfo listInfo, BlockNumber insertPage, BlockNumber originalInsertPage, BlockNumber startPage, ForkNumber forkNum); -void IvfflatCommitBuffer(Buffer buf, GenericXLogState *state); -void IvfflatAppendPage(Relation index, Buffer *buf, Page *page, GenericXLogState **state, ForkNumber forkNum); -Buffer IvfflatNewBuffer(Relation index, ForkNumber forkNum); -void IvfflatInitPage(Buffer buf, Page page); -void IvfflatInitRegisterPage(Relation index, Buffer *buf, Page *page, GenericXLogState **state); - -/* Index 
access methods */ -IndexBuildResult *ivfflatbuild(Relation heap, Relation index, IndexInfo *indexInfo); -void ivfflatbuildempty(Relation index); -bool ivfflatinsert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid, Relation heap, IndexUniqueCheck checkUnique -#if PG_VERSION_NUM >= 140000 - ,bool indexUnchanged -#endif - ,IndexInfo *indexInfo -); -IndexBulkDeleteResult *ivfflatbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state); -IndexBulkDeleteResult *ivfflatvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats); -IndexScanDesc ivfflatbeginscan(Relation index, int nkeys, int norderbys); -void ivfflatrescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys); -bool ivfflatgettuple(IndexScanDesc scan, ScanDirection dir); -void ivfflatendscan(IndexScanDesc scan); - -#endif -#endif diff --git a/src/postgres/third-party-extensions/pgvector/src/ivfinsert.c b/src/postgres/third-party-extensions/pgvector/src/ivfinsert.c deleted file mode 100644 index 37d5efa55740..000000000000 --- a/src/postgres/third-party-extensions/pgvector/src/ivfinsert.c +++ /dev/null @@ -1,217 +0,0 @@ -#ifdef YB_IVFFLAT_INDEX_SUPPORT -#include "postgres.h" - -#include - -#include "ivfflat.h" -#include "storage/bufmgr.h" -#include "utils/memutils.h" - -/* - * Find the list that minimizes the distance function - */ -static void -FindInsertPage(Relation rel, Datum *values, BlockNumber *insertPage, ListInfo * listInfo) -{ - Buffer cbuf; - Page cpage; - IvfflatList list; - double distance; - double minDistance = DBL_MAX; - BlockNumber nextblkno = IVFFLAT_HEAD_BLKNO; - FmgrInfo *procinfo; - Oid collation; - OffsetNumber offno; - OffsetNumber maxoffno; - - /* Avoid compiler warning */ - listInfo->blkno = nextblkno; - listInfo->offno = FirstOffsetNumber; - - procinfo = index_getprocinfo(rel, 1, IVFFLAT_DISTANCE_PROC); - collation = rel->rd_indcollation[0]; - - /* Search all list pages */ 
- while (BlockNumberIsValid(nextblkno)) - { - cbuf = ReadBuffer(rel, nextblkno); - LockBuffer(cbuf, BUFFER_LOCK_SHARE); - cpage = BufferGetPage(cbuf); - maxoffno = PageGetMaxOffsetNumber(cpage); - - for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) - { - list = (IvfflatList) PageGetItem(cpage, PageGetItemId(cpage, offno)); - distance = DatumGetFloat8(FunctionCall2Coll(procinfo, collation, values[0], PointerGetDatum(&list->center))); - - if (distance < minDistance || !BlockNumberIsValid(*insertPage)) - { - *insertPage = list->insertPage; - listInfo->blkno = nextblkno; - listInfo->offno = offno; - minDistance = distance; - } - } - - nextblkno = IvfflatPageGetOpaque(cpage)->nextblkno; - - UnlockReleaseBuffer(cbuf); - } -} - -/* - * Insert a tuple into the index - */ -static void -InsertTuple(Relation rel, Datum *values, bool *isnull, ItemPointer heap_tid, Relation heapRel) -{ - IndexTuple itup; - Datum value; - FmgrInfo *normprocinfo; - Buffer buf; - Page page; - GenericXLogState *state; - Size itemsz; - BlockNumber insertPage = InvalidBlockNumber; - ListInfo listInfo; - BlockNumber originalInsertPage; - - /* Detoast once for all calls */ - value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); - - /* Normalize if needed */ - normprocinfo = IvfflatOptionalProcInfo(rel, IVFFLAT_NORM_PROC); - if (normprocinfo != NULL) - { - if (!IvfflatNormValue(normprocinfo, rel->rd_indcollation[0], &value, NULL)) - return; - } - - /* Find the insert page - sets the page and list info */ - FindInsertPage(rel, values, &insertPage, &listInfo); - Assert(BlockNumberIsValid(insertPage)); - originalInsertPage = insertPage; - - /* Form tuple */ - itup = index_form_tuple(RelationGetDescr(rel), &value, isnull); - itup->t_tid = *heap_tid; - - /* Get tuple size */ - itemsz = MAXALIGN(IndexTupleSize(itup)); - Assert(itemsz <= BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(IvfflatPageOpaqueData))); - - /* Find a page to insert the item */ - for (;;) - { - 
buf = ReadBuffer(rel, insertPage); - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - - state = GenericXLogStart(rel); - page = GenericXLogRegisterBuffer(state, buf, 0); - - if (PageGetFreeSpace(page) >= itemsz) - break; - - insertPage = IvfflatPageGetOpaque(page)->nextblkno; - - if (BlockNumberIsValid(insertPage)) - { - /* Move to next page */ - GenericXLogAbort(state); - UnlockReleaseBuffer(buf); - } - else - { - Buffer metabuf; - Buffer newbuf; - Page newpage; - - /* - * From ReadBufferExtended: Caller is responsible for ensuring - * that only one backend tries to extend a relation at the same - * time! - */ - metabuf = ReadBuffer(rel, IVFFLAT_METAPAGE_BLKNO); - LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); - - /* Add a new page */ - newbuf = IvfflatNewBuffer(rel, MAIN_FORKNUM); - newpage = GenericXLogRegisterBuffer(state, newbuf, GENERIC_XLOG_FULL_IMAGE); - - /* Init new page */ - IvfflatInitPage(newbuf, newpage); - - /* Update insert page */ - insertPage = BufferGetBlockNumber(newbuf); - - /* Update previous buffer */ - IvfflatPageGetOpaque(page)->nextblkno = insertPage; - - /* Commit */ - MarkBufferDirty(newbuf); - MarkBufferDirty(buf); - GenericXLogFinish(state); - - /* Unlock extend relation lock as early as possible */ - UnlockReleaseBuffer(metabuf); - - /* Unlock previous buffer */ - UnlockReleaseBuffer(buf); - - /* Prepare new buffer */ - state = GenericXLogStart(rel); - buf = newbuf; - page = GenericXLogRegisterBuffer(state, buf, 0); - break; - } - } - - /* Add to next offset */ - if (PageAddItem(page, (Item) itup, itemsz, InvalidOffsetNumber, false, false) == InvalidOffsetNumber) - elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); - - IvfflatCommitBuffer(buf, state); - - /* Update the insert page */ - if (insertPage != originalInsertPage) - IvfflatUpdateList(rel, state, listInfo, insertPage, originalInsertPage, InvalidBlockNumber, MAIN_FORKNUM); -} - -/* - * Insert a tuple into the index - */ -bool -ivfflatinsert(Relation index, 
Datum *values, bool *isnull, ItemPointer heap_tid, - Relation heap, IndexUniqueCheck checkUnique -#if PG_VERSION_NUM >= 140000 - ,bool indexUnchanged -#endif - ,IndexInfo *indexInfo -) -{ - MemoryContext oldCtx; - MemoryContext insertCtx; - - /* Skip nulls */ - if (isnull[0]) - return false; - - /* - * Use memory context since detoast, IvfflatNormValue, and - * index_form_tuple can allocate - */ - insertCtx = AllocSetContextCreate(GetCurrentMemoryContext(), - "Ivfflat insert temporary context", - ALLOCSET_DEFAULT_SIZES); - oldCtx = MemoryContextSwitchTo(insertCtx); - - /* Insert tuple */ - InsertTuple(index, values, isnull, heap_tid, heap); - - /* Delete memory context */ - MemoryContextSwitchTo(oldCtx); - MemoryContextDelete(insertCtx); - - return false; -} -#endif diff --git a/src/postgres/third-party-extensions/pgvector/src/ivfkmeans.c b/src/postgres/third-party-extensions/pgvector/src/ivfkmeans.c deleted file mode 100644 index ba6ab8751cd3..000000000000 --- a/src/postgres/third-party-extensions/pgvector/src/ivfkmeans.c +++ /dev/null @@ -1,536 +0,0 @@ -#ifdef YB_IVFFLAT_INDEX_SUPPORT -#include "postgres.h" - -#include -#include - -#include "ivfflat.h" -#include "miscadmin.h" - -/* - * Initialize with kmeans++ - * - * https://theory.stanford.edu/~sergei/papers/kMeansPP-soda.pdf - */ -static void -InitCenters(Relation index, VectorArray samples, VectorArray centers, float *lowerBound) -{ - FmgrInfo *procinfo; - Oid collation; - int i; - int64 j; - double distance; - double sum; - double choice; - Vector *vec; - float *weight = palloc(samples->length * sizeof(float)); - int numCenters = centers->maxlen; - int numSamples = samples->length; - - procinfo = index_getprocinfo(index, 1, IVFFLAT_KMEANS_DISTANCE_PROC); - collation = index->rd_indcollation[0]; - - /* Choose an initial center uniformly at random */ - VectorArraySet(centers, 0, VectorArrayGet(samples, RandomInt() % samples->length)); - centers->length++; - - for (j = 0; j < numSamples; j++) - weight[j] = 
DBL_MAX; - - for (i = 0; i < numCenters; i++) - { - CHECK_FOR_INTERRUPTS(); - - sum = 0.0; - - for (j = 0; j < numSamples; j++) - { - vec = VectorArrayGet(samples, j); - - /* Only need to compute distance for new center */ - /* TODO Use triangle inequality to reduce distance calculations */ - distance = DatumGetFloat8(FunctionCall2Coll(procinfo, collation, PointerGetDatum(vec), PointerGetDatum(VectorArrayGet(centers, i)))); - - /* Set lower bound */ - lowerBound[j * numCenters + i] = distance; - - /* Use distance squared for weighted probability distribution */ - distance *= distance; - - if (distance < weight[j]) - weight[j] = distance; - - sum += weight[j]; - } - - /* Only compute lower bound on last iteration */ - if (i + 1 == numCenters) - break; - - /* Choose new center using weighted probability distribution. */ - choice = sum * RandomDouble(); - for (j = 0; j < numSamples - 1; j++) - { - choice -= weight[j]; - if (choice <= 0) - break; - } - - VectorArraySet(centers, i + 1, VectorArrayGet(samples, j)); - centers->length++; - } - - pfree(weight); -} - -/* - * Apply norm to vector - */ -static inline void -ApplyNorm(FmgrInfo *normprocinfo, Oid collation, Vector * vec) -{ - int i; - double norm = DatumGetFloat8(FunctionCall1Coll(normprocinfo, collation, PointerGetDatum(vec))); - - /* TODO Handle zero norm */ - if (norm > 0) - { - for (i = 0; i < vec->dim; i++) - vec->x[i] /= norm; - } -} - -/* - * Compare vectors - */ -static int -CompareVectors(const void *a, const void *b) -{ - return vector_cmp_internal((Vector *) a, (Vector *) b); -} - -/* - * Quick approach if we have little data - */ -static void -QuickCenters(Relation index, VectorArray samples, VectorArray centers) -{ - int i; - int j; - Vector *vec; - int dimensions = centers->dim; - Oid collation = index->rd_indcollation[0]; - FmgrInfo *normprocinfo = IvfflatOptionalProcInfo(index, IVFFLAT_KMEANS_NORM_PROC); - - /* Copy existing vectors while avoiding duplicates */ - if (samples->length > 0) - { - 
qsort(samples->items, samples->length, VECTOR_SIZE(samples->dim), CompareVectors); - for (i = 0; i < samples->length; i++) - { - vec = VectorArrayGet(samples, i); - - if (i == 0 || CompareVectors(vec, VectorArrayGet(samples, i - 1)) != 0) - { - VectorArraySet(centers, centers->length, vec); - centers->length++; - } - } - } - - /* Fill remaining with random data */ - while (centers->length < centers->maxlen) - { - vec = VectorArrayGet(centers, centers->length); - - SET_VARSIZE(vec, VECTOR_SIZE(dimensions)); - vec->dim = dimensions; - - for (j = 0; j < dimensions; j++) - vec->x[j] = RandomDouble(); - - /* Normalize if needed (only needed for random centers) */ - if (normprocinfo != NULL) - ApplyNorm(normprocinfo, collation, vec); - - centers->length++; - } -} - -/* - * Use Elkan for performance. This requires distance function to satisfy triangle inequality. - * - * We use L2 distance for L2 (not L2 squared like index scan) - * and angular distance for inner product and cosine distance - * - * https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf - */ -static void -ElkanKmeans(Relation index, VectorArray samples, VectorArray centers) -{ - FmgrInfo *procinfo; - FmgrInfo *normprocinfo; - Oid collation; - Vector *vec; - Vector *newCenter; - int iteration; - int64 j; - int64 k; - int dimensions = centers->dim; - int numCenters = centers->maxlen; - int numSamples = samples->length; - VectorArray newCenters; - int *centerCounts; - int *closestCenters; - float *lowerBound; - float *upperBound; - float *s; - float *halfcdist; - float *newcdist; - int changes; - double minDistance; - int closestCenter; - double distance; - bool rj; - bool rjreset; - double dxcx; - double dxc; - - /* Calculate allocation sizes */ - Size samplesSize = VECTOR_ARRAY_SIZE(samples->maxlen, samples->dim); - Size centersSize = VECTOR_ARRAY_SIZE(centers->maxlen, centers->dim); - Size newCentersSize = VECTOR_ARRAY_SIZE(numCenters, dimensions); - Size centerCountsSize = sizeof(int) * numCenters; - Size 
closestCentersSize = sizeof(int) * numSamples; - Size lowerBoundSize = sizeof(float) * numSamples * numCenters; - Size upperBoundSize = sizeof(float) * numSamples; - Size sSize = sizeof(float) * numCenters; - Size halfcdistSize = sizeof(float) * numCenters * numCenters; - Size newcdistSize = sizeof(float) * numCenters; - - /* Calculate total size */ - Size totalSize = samplesSize + centersSize + newCentersSize + centerCountsSize + closestCentersSize + lowerBoundSize + upperBoundSize + sSize + halfcdistSize + newcdistSize; - - /* Check memory requirements */ - /* Add one to error message to ceil */ - if (totalSize > (Size) maintenance_work_mem * 1024L) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("memory required is %zu MB, maintenance_work_mem is %d MB", - totalSize / (1024 * 1024) + 1, maintenance_work_mem / 1024))); - - /* Ensure indexing does not overflow */ - if (numCenters * numCenters > INT_MAX) - elog(ERROR, "Indexing overflow detected. Please report a bug."); - - /* Set support functions */ - procinfo = index_getprocinfo(index, 1, IVFFLAT_KMEANS_DISTANCE_PROC); - normprocinfo = IvfflatOptionalProcInfo(index, IVFFLAT_KMEANS_NORM_PROC); - collation = index->rd_indcollation[0]; - - /* Allocate space */ - /* Use float instead of double to save memory */ - centerCounts = palloc(centerCountsSize); - closestCenters = palloc(closestCentersSize); - lowerBound = palloc_extended(lowerBoundSize, MCXT_ALLOC_HUGE); - upperBound = palloc(upperBoundSize); - s = palloc(sSize); - halfcdist = palloc_extended(halfcdistSize, MCXT_ALLOC_HUGE); - newcdist = palloc(newcdistSize); - - newCenters = VectorArrayInit(numCenters, dimensions); - for (j = 0; j < numCenters; j++) - { - vec = VectorArrayGet(newCenters, j); - SET_VARSIZE(vec, VECTOR_SIZE(dimensions)); - vec->dim = dimensions; - } - - /* Pick initial centers */ - InitCenters(index, samples, centers, lowerBound); - - /* Assign each x to its closest initial center c(x) = argmin d(x,c) */ - for (j = 0; j 
< numSamples; j++) - { - minDistance = DBL_MAX; - closestCenter = 0; - - /* Find closest center */ - for (k = 0; k < numCenters; k++) - { - /* TODO Use Lemma 1 in k-means++ initialization */ - distance = lowerBound[j * numCenters + k]; - - if (distance < minDistance) - { - minDistance = distance; - closestCenter = k; - } - } - - upperBound[j] = minDistance; - closestCenters[j] = closestCenter; - } - - /* Give 500 iterations to converge */ - for (iteration = 0; iteration < 500; iteration++) - { - /* Can take a while, so ensure we can interrupt */ - CHECK_FOR_INTERRUPTS(); - - changes = 0; - - /* Step 1: For all centers, compute distance */ - for (j = 0; j < numCenters; j++) - { - vec = VectorArrayGet(centers, j); - - for (k = j + 1; k < numCenters; k++) - { - distance = 0.5 * DatumGetFloat8(FunctionCall2Coll(procinfo, collation, PointerGetDatum(vec), PointerGetDatum(VectorArrayGet(centers, k)))); - halfcdist[j * numCenters + k] = distance; - halfcdist[k * numCenters + j] = distance; - } - } - - /* For all centers c, compute s(c) */ - for (j = 0; j < numCenters; j++) - { - minDistance = DBL_MAX; - - for (k = 0; k < numCenters; k++) - { - if (j == k) - continue; - - distance = halfcdist[j * numCenters + k]; - if (distance < minDistance) - minDistance = distance; - } - - s[j] = minDistance; - } - - rjreset = iteration != 0; - - for (j = 0; j < numSamples; j++) - { - /* Step 2: Identify all points x such that u(x) <= s(c(x)) */ - if (upperBound[j] <= s[closestCenters[j]]) - continue; - - rj = rjreset; - - for (k = 0; k < numCenters; k++) - { - /* Step 3: For all remaining points x and centers c */ - if (k == closestCenters[j]) - continue; - - if (upperBound[j] <= lowerBound[j * numCenters + k]) - continue; - - if (upperBound[j] <= halfcdist[closestCenters[j] * numCenters + k]) - continue; - - vec = VectorArrayGet(samples, j); - - /* Step 3a */ - if (rj) - { - dxcx = DatumGetFloat8(FunctionCall2Coll(procinfo, collation, PointerGetDatum(vec), 
PointerGetDatum(VectorArrayGet(centers, closestCenters[j])))); - - /* d(x,c(x)) computed, which is a form of d(x,c) */ - lowerBound[j * numCenters + closestCenters[j]] = dxcx; - upperBound[j] = dxcx; - - rj = false; - } - else - dxcx = upperBound[j]; - - /* Step 3b */ - if (dxcx > lowerBound[j * numCenters + k] || dxcx > halfcdist[closestCenters[j] * numCenters + k]) - { - dxc = DatumGetFloat8(FunctionCall2Coll(procinfo, collation, PointerGetDatum(vec), PointerGetDatum(VectorArrayGet(centers, k)))); - - /* d(x,c) calculated */ - lowerBound[j * numCenters + k] = dxc; - - if (dxc < dxcx) - { - closestCenters[j] = k; - - /* c(x) changed */ - upperBound[j] = dxc; - - changes++; - } - - } - } - } - - /* Step 4: For each center c, let m(c) be mean of all points assigned */ - for (j = 0; j < numCenters; j++) - { - vec = VectorArrayGet(newCenters, j); - for (k = 0; k < dimensions; k++) - vec->x[k] = 0.0; - - centerCounts[j] = 0; - } - - for (j = 0; j < numSamples; j++) - { - vec = VectorArrayGet(samples, j); - closestCenter = closestCenters[j]; - - /* Increment sum and count of closest center */ - newCenter = VectorArrayGet(newCenters, closestCenter); - for (k = 0; k < dimensions; k++) - newCenter->x[k] += vec->x[k]; - - centerCounts[closestCenter] += 1; - } - - for (j = 0; j < numCenters; j++) - { - vec = VectorArrayGet(newCenters, j); - - if (centerCounts[j] > 0) - { - /* Double avoids overflow, but requires more memory */ - /* TODO Update bounds */ - for (k = 0; k < dimensions; k++) - { - if (isinf(vec->x[k])) - vec->x[k] = vec->x[k] > 0 ? 
FLT_MAX : -FLT_MAX; - } - - for (k = 0; k < dimensions; k++) - vec->x[k] /= centerCounts[j]; - } - else - { - /* TODO Handle empty centers properly */ - for (k = 0; k < dimensions; k++) - vec->x[k] = RandomDouble(); - } - - /* Normalize if needed */ - if (normprocinfo != NULL) - ApplyNorm(normprocinfo, collation, vec); - } - - /* Step 5 */ - for (j = 0; j < numCenters; j++) - newcdist[j] = DatumGetFloat8(FunctionCall2Coll(procinfo, collation, PointerGetDatum(VectorArrayGet(centers, j)), PointerGetDatum(VectorArrayGet(newCenters, j)))); - - for (j = 0; j < numSamples; j++) - { - for (k = 0; k < numCenters; k++) - { - distance = lowerBound[j * numCenters + k] - newcdist[k]; - - if (distance < 0) - distance = 0; - - lowerBound[j * numCenters + k] = distance; - } - } - - /* Step 6 */ - /* We reset r(x) before Step 3 in the next iteration */ - for (j = 0; j < numSamples; j++) - upperBound[j] += newcdist[closestCenters[j]]; - - /* Step 7 */ - for (j = 0; j < numCenters; j++) - memcpy(VectorArrayGet(centers, j), VectorArrayGet(newCenters, j), VECTOR_SIZE(dimensions)); - - if (changes == 0 && iteration != 0) - break; - } - - VectorArrayFree(newCenters); - pfree(centerCounts); - pfree(closestCenters); - pfree(lowerBound); - pfree(upperBound); - pfree(s); - pfree(halfcdist); - pfree(newcdist); -} - -/* - * Detect issues with centers - */ -static void -CheckCenters(Relation index, VectorArray centers) -{ - FmgrInfo *normprocinfo; - Oid collation; - Vector *vec; - int i; - int j; - double norm; - - if (centers->length != centers->maxlen) - elog(ERROR, "Not enough centers. Please report a bug."); - - /* Ensure no NaN or infinite values */ - for (i = 0; i < centers->length; i++) - { - vec = VectorArrayGet(centers, i); - - for (j = 0; j < vec->dim; j++) - { - if (isnan(vec->x[j])) - elog(ERROR, "NaN detected. Please report a bug."); - - if (isinf(vec->x[j])) - elog(ERROR, "Infinite value detected. 
Please report a bug."); - } - } - - /* Ensure no duplicate centers */ - /* Fine to sort in-place */ - qsort(centers->items, centers->length, VECTOR_SIZE(centers->dim), CompareVectors); - for (i = 1; i < centers->length; i++) - { - if (CompareVectors(VectorArrayGet(centers, i), VectorArrayGet(centers, i - 1)) == 0) - elog(ERROR, "Duplicate centers detected. Please report a bug."); - } - - /* Ensure no zero vectors for cosine distance */ - /* Check NORM_PROC instead of KMEANS_NORM_PROC */ - normprocinfo = IvfflatOptionalProcInfo(index, IVFFLAT_NORM_PROC); - if (normprocinfo != NULL) - { - collation = index->rd_indcollation[0]; - - for (i = 0; i < centers->length; i++) - { - norm = DatumGetFloat8(FunctionCall1Coll(normprocinfo, collation, PointerGetDatum(VectorArrayGet(centers, i)))); - if (norm == 0) - elog(ERROR, "Zero norm detected. Please report a bug."); - } - } -} - -/* - * Perform naive k-means centering - * We use spherical k-means for inner product and cosine - */ -void -IvfflatKmeans(Relation index, VectorArray samples, VectorArray centers) -{ - if (samples->length <= centers->maxlen) - QuickCenters(index, samples, centers); - else - ElkanKmeans(index, samples, centers); - - CheckCenters(index, centers); -} -#endif diff --git a/src/postgres/third-party-extensions/pgvector/src/ivfscan.c b/src/postgres/third-party-extensions/pgvector/src/ivfscan.c deleted file mode 100644 index ab49624b01a5..000000000000 --- a/src/postgres/third-party-extensions/pgvector/src/ivfscan.c +++ /dev/null @@ -1,364 +0,0 @@ -#ifdef YB_IVFFLAT_INDEX_SUPPORT -#include "postgres.h" - -#include - -#include "access/relscan.h" -#include "ivfflat.h" -#include "miscadmin.h" -#include "pgstat.h" -#include "storage/bufmgr.h" - -#include "catalog/pg_operator_d.h" -#include "catalog/pg_type_d.h" - -/* - * Compare list distances - */ -static int -CompareLists(const pairingheap_node *a, const pairingheap_node *b, void *arg) -{ - if (((const IvfflatScanList *) a)->distance > ((const IvfflatScanList 
*) b)->distance) - return 1; - - if (((const IvfflatScanList *) a)->distance < ((const IvfflatScanList *) b)->distance) - return -1; - - return 0; -} - -/* - * Get lists and sort by distance - */ -static void -GetScanLists(IndexScanDesc scan, Datum value) -{ - Buffer cbuf; - Page cpage; - IvfflatList list; - OffsetNumber offno; - OffsetNumber maxoffno; - BlockNumber nextblkno = IVFFLAT_HEAD_BLKNO; - int listCount = 0; - IvfflatScanOpaque so = (IvfflatScanOpaque) scan->opaque; - double distance; - IvfflatScanList *scanlist; - double maxDistance = DBL_MAX; - - /* Search all list pages */ - while (BlockNumberIsValid(nextblkno)) - { - cbuf = ReadBuffer(scan->indexRelation, nextblkno); - LockBuffer(cbuf, BUFFER_LOCK_SHARE); - cpage = BufferGetPage(cbuf); - - maxoffno = PageGetMaxOffsetNumber(cpage); - - for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) - { - list = (IvfflatList) PageGetItem(cpage, PageGetItemId(cpage, offno)); - - /* Use procinfo from the index instead of scan key for performance */ - distance = DatumGetFloat8(FunctionCall2Coll(so->procinfo, so->collation, PointerGetDatum(&list->center), value)); - - if (listCount < so->probes) - { - scanlist = &so->lists[listCount]; - scanlist->startPage = list->startPage; - scanlist->distance = distance; - listCount++; - - /* Add to heap */ - pairingheap_add(so->listQueue, &scanlist->ph_node); - - /* Calculate max distance */ - if (listCount == so->probes) - maxDistance = ((IvfflatScanList *) pairingheap_first(so->listQueue))->distance; - } - else if (distance < maxDistance) - { - /* Remove */ - scanlist = (IvfflatScanList *) pairingheap_remove_first(so->listQueue); - - /* Reuse */ - scanlist->startPage = list->startPage; - scanlist->distance = distance; - pairingheap_add(so->listQueue, &scanlist->ph_node); - - /* Update max distance */ - maxDistance = ((IvfflatScanList *) pairingheap_first(so->listQueue))->distance; - } - } - - nextblkno = IvfflatPageGetOpaque(cpage)->nextblkno; - - 
UnlockReleaseBuffer(cbuf); - } -} - -/* - * Get items - */ -static void -GetScanItems(IndexScanDesc scan, Datum value) -{ - IvfflatScanOpaque so = (IvfflatScanOpaque) scan->opaque; - Buffer buf; - Page page; - IndexTuple itup; - BlockNumber searchPage; - OffsetNumber offno; - OffsetNumber maxoffno; - Datum datum; - bool isnull; - TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); - double tuples = 0; - -#if PG_VERSION_NUM >= 120000 - TupleTableSlot *slot = MakeSingleTupleTableSlot(so->tupdesc, &TTSOpsVirtual); -#else - TupleTableSlot *slot = MakeSingleTupleTableSlot(so->tupdesc); -#endif - - /* - * Reuse same set of shared buffers for scan - * - * See postgres/src/backend/storage/buffer/README for description - */ - BufferAccessStrategy bas = GetAccessStrategy(BAS_BULKREAD); - - /* Search closest probes lists */ - while (!pairingheap_is_empty(so->listQueue)) - { - searchPage = ((IvfflatScanList *) pairingheap_remove_first(so->listQueue))->startPage; - - /* Search all entry pages for list */ - while (BlockNumberIsValid(searchPage)) - { - buf = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM, searchPage, RBM_NORMAL, bas); - LockBuffer(buf, BUFFER_LOCK_SHARE); - page = BufferGetPage(buf); - maxoffno = PageGetMaxOffsetNumber(page); - - for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) - { - itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); - datum = index_getattr(itup, 1, tupdesc, &isnull); - - /* - * Add virtual tuple - * - * Use procinfo from the index instead of scan key for - * performance - */ - ExecClearTuple(slot); - slot->tts_values[0] = FunctionCall2Coll(so->procinfo, so->collation, datum, value); - slot->tts_isnull[0] = false; - slot->tts_values[1] = PointerGetDatum(&itup->t_tid); - slot->tts_isnull[1] = false; - slot->tts_values[2] = Int32GetDatum((int) searchPage); - slot->tts_isnull[2] = false; - ExecStoreVirtualTuple(slot); - - tuplesort_puttupleslot(so->sortstate, slot); - - tuples++; - } - 
- searchPage = IvfflatPageGetOpaque(page)->nextblkno; - - UnlockReleaseBuffer(buf); - } - } - - FreeAccessStrategy(bas); - - /* TODO Scan more lists */ - if (tuples < 100) - ereport(DEBUG1, - (errmsg("index scan found few tuples"), - errdetail("Index may have been created with little data."), - errhint("Recreate the index and possibly decrease lists."))); - - tuplesort_performsort(so->sortstate); -} - -/* - * Prepare for an index scan - */ -IndexScanDesc -ivfflatbeginscan(Relation index, int nkeys, int norderbys) -{ - IndexScanDesc scan; - IvfflatScanOpaque so; - int lists; - AttrNumber attNums[] = {1}; - Oid sortOperators[] = {Float8LessOperator}; - Oid sortCollations[] = {InvalidOid}; - bool nullsFirstFlags[] = {false}; - int probes = ivfflat_probes; - - scan = RelationGetIndexScan(index, nkeys, norderbys); - lists = IvfflatGetLists(scan->indexRelation); - - if (probes > lists) - probes = lists; - - so = (IvfflatScanOpaque) palloc(offsetof(IvfflatScanOpaqueData, lists) + probes * sizeof(IvfflatScanList)); - so->buf = InvalidBuffer; - so->first = true; - so->probes = probes; - - /* Set support functions */ - so->procinfo = index_getprocinfo(index, 1, IVFFLAT_DISTANCE_PROC); - so->normprocinfo = IvfflatOptionalProcInfo(index, IVFFLAT_NORM_PROC); - so->collation = index->rd_indcollation[0]; - - /* Create tuple description for sorting */ -#if PG_VERSION_NUM >= 120000 - so->tupdesc = CreateTemplateTupleDesc(3); -#else - so->tupdesc = CreateTemplateTupleDesc(3, false); -#endif - TupleDescInitEntry(so->tupdesc, (AttrNumber) 1, "distance", FLOAT8OID, -1, 0); - TupleDescInitEntry(so->tupdesc, (AttrNumber) 2, "tid", TIDOID, -1, 0); - TupleDescInitEntry(so->tupdesc, (AttrNumber) 3, "indexblkno", INT4OID, -1, 0); - - /* Prep sort */ - so->sortstate = tuplesort_begin_heap(so->tupdesc, 1, attNums, sortOperators, sortCollations, nullsFirstFlags, work_mem, NULL, false); - -#if PG_VERSION_NUM >= 120000 - so->slot = MakeSingleTupleTableSlot(so->tupdesc, &TTSOpsMinimalTuple); 
-#else - so->slot = MakeSingleTupleTableSlot(so->tupdesc); -#endif - - so->listQueue = pairingheap_allocate(CompareLists, scan); - - scan->opaque = so; - - return scan; -} - -/* - * Start or restart an index scan - */ -void -ivfflatrescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys) -{ - IvfflatScanOpaque so = (IvfflatScanOpaque) scan->opaque; - -#if PG_VERSION_NUM >= 130000 - if (!so->first) - tuplesort_reset(so->sortstate); -#endif - - so->first = true; - pairingheap_reset(so->listQueue); - - if (keys && scan->numberOfKeys > 0) - memmove(scan->keyData, keys, scan->numberOfKeys * sizeof(ScanKeyData)); - - if (orderbys && scan->numberOfOrderBys > 0) - memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData)); -} - -/* - * Fetch the next tuple in the given scan - */ -bool -ivfflatgettuple(IndexScanDesc scan, ScanDirection dir) -{ - IvfflatScanOpaque so = (IvfflatScanOpaque) scan->opaque; - - /* - * Index can be used to scan backward, but Postgres doesn't support - * backward scan on operators - */ - Assert(ScanDirectionIsForward(dir)); - - if (so->first) - { - Datum value; - - /* Count index scan for stats */ - pgstat_count_index_scan(scan->indexRelation); - - /* Safety check */ - if (scan->orderByData == NULL) - elog(ERROR, "cannot scan ivfflat index without order"); - - /* No items will match if null */ - if (scan->orderByData->sk_flags & SK_ISNULL) - return false; - - value = scan->orderByData->sk_argument; - - /* Value should not be compressed or toasted */ - Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value))); - Assert(!VARATT_IS_EXTENDED(DatumGetPointer(value))); - - if (so->normprocinfo != NULL) - { - /* No items will match if normalization fails */ - if (!IvfflatNormValue(so->normprocinfo, so->collation, &value, NULL)) - return false; - } - - IvfflatBench("GetScanLists", GetScanLists(scan, value)); - IvfflatBench("GetScanItems", GetScanItems(scan, value)); - so->first = false; - - /* Clean up if we 
allocated a new value */ - if (value != scan->orderByData->sk_argument) - pfree(DatumGetPointer(value)); - } - - if (tuplesort_gettupleslot(so->sortstate, true, false, so->slot, NULL)) - { - ItemPointer tid = (ItemPointer) DatumGetPointer(slot_getattr(so->slot, 2, &so->isnull)); - BlockNumber indexblkno = DatumGetInt32(slot_getattr(so->slot, 3, &so->isnull)); - -#if PG_VERSION_NUM >= 120000 - scan->xs_heaptid = *tid; -#else - scan->xs_ctup.t_self = *tid; -#endif - - if (BufferIsValid(so->buf)) - ReleaseBuffer(so->buf); - - /* - * An index scan must maintain a pin on the index page holding the - * item last returned by amgettuple - * - * https://www.postgresql.org/docs/current/index-locking.html - */ - so->buf = ReadBuffer(scan->indexRelation, indexblkno); - - scan->xs_recheckorderby = false; - return true; - } - - return false; -} - -/* - * End a scan and release resources - */ -void -ivfflatendscan(IndexScanDesc scan) -{ - IvfflatScanOpaque so = (IvfflatScanOpaque) scan->opaque; - - /* Release pin */ - if (BufferIsValid(so->buf)) - ReleaseBuffer(so->buf); - - pairingheap_free(so->listQueue); - tuplesort_end(so->sortstate); - - pfree(so); - scan->opaque = NULL; -} -#endif diff --git a/src/postgres/third-party-extensions/pgvector/src/ivfutils.c b/src/postgres/third-party-extensions/pgvector/src/ivfutils.c deleted file mode 100644 index f3afaba7ec48..000000000000 --- a/src/postgres/third-party-extensions/pgvector/src/ivfutils.c +++ /dev/null @@ -1,227 +0,0 @@ -#ifdef YB_IVFFLAT_INDEX_SUPPORT -#include "postgres.h" - -#include "ivfflat.h" -#include "storage/bufmgr.h" -#include "vector.h" - -/* - * Allocate a vector array - */ -VectorArray -VectorArrayInit(int maxlen, int dimensions) -{ - VectorArray res = palloc(sizeof(VectorArrayData)); - - res->length = 0; - res->maxlen = maxlen; - res->dim = dimensions; - res->items = palloc_extended(maxlen * VECTOR_SIZE(dimensions), MCXT_ALLOC_ZERO | MCXT_ALLOC_HUGE); - return res; -} - -/* - * Free a vector array - */ -void 
-VectorArrayFree(VectorArray arr) -{ - pfree(arr->items); - pfree(arr); -} - -/* - * Print vector array - useful for debugging - */ -void -PrintVectorArray(char *msg, VectorArray arr) -{ - int i; - - for (i = 0; i < arr->length; i++) - PrintVector(msg, VectorArrayGet(arr, i)); -} - -/* - * Get the number of lists in the index - */ -int -IvfflatGetLists(Relation index) -{ - IvfflatOptions *opts = (IvfflatOptions *) index->rd_options; - - if (opts) - return opts->lists; - - return IVFFLAT_DEFAULT_LISTS; -} - -/* - * Get proc - */ -FmgrInfo * -IvfflatOptionalProcInfo(Relation rel, uint16 procnum) -{ - if (!OidIsValid(index_getprocid(rel, 1, procnum))) - return NULL; - - return index_getprocinfo(rel, 1, procnum); -} - -/* - * Divide by the norm - * - * Returns false if value should not be indexed - * - * The caller needs to free the pointer stored in value - * if it's different than the original value - */ -bool -IvfflatNormValue(FmgrInfo *procinfo, Oid collation, Datum *value, Vector * result) -{ - Vector *v; - int i; - double norm; - - norm = DatumGetFloat8(FunctionCall1Coll(procinfo, collation, *value)); - - if (norm > 0) - { - v = DatumGetVector(*value); - - if (result == NULL) - result = InitVector(v->dim); - - for (i = 0; i < v->dim; i++) - result->x[i] = v->x[i] / norm; - - *value = PointerGetDatum(result); - - return true; - } - - return false; -} - -/* - * New buffer - */ -Buffer -IvfflatNewBuffer(Relation index, ForkNumber forkNum) -{ - Buffer buf = ReadBufferExtended(index, forkNum, P_NEW, RBM_NORMAL, NULL); - - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - return buf; -} - -/* - * Init page - */ -void -IvfflatInitPage(Buffer buf, Page page) -{ - PageInit(page, BufferGetPageSize(buf), sizeof(IvfflatPageOpaqueData)); - IvfflatPageGetOpaque(page)->nextblkno = InvalidBlockNumber; - IvfflatPageGetOpaque(page)->page_id = IVFFLAT_PAGE_ID; -} - -/* - * Init and register page - */ -void -IvfflatInitRegisterPage(Relation index, Buffer *buf, Page *page, GenericXLogState 
**state) -{ - *state = GenericXLogStart(index); - *page = GenericXLogRegisterBuffer(*state, *buf, GENERIC_XLOG_FULL_IMAGE); - IvfflatInitPage(*buf, *page); -} - -/* - * Commit buffer - */ -void -IvfflatCommitBuffer(Buffer buf, GenericXLogState *state) -{ - MarkBufferDirty(buf); - GenericXLogFinish(state); - UnlockReleaseBuffer(buf); -} - -/* - * Add a new page - * - * The order is very important!! - */ -void -IvfflatAppendPage(Relation index, Buffer *buf, Page *page, GenericXLogState **state, ForkNumber forkNum) -{ - /* Get new buffer */ - Buffer newbuf = IvfflatNewBuffer(index, forkNum); - Page newpage = GenericXLogRegisterBuffer(*state, newbuf, GENERIC_XLOG_FULL_IMAGE); - - /* Update the previous buffer */ - IvfflatPageGetOpaque(*page)->nextblkno = BufferGetBlockNumber(newbuf); - - /* Init new page */ - IvfflatInitPage(newbuf, newpage); - - /* Commit */ - MarkBufferDirty(*buf); - MarkBufferDirty(newbuf); - GenericXLogFinish(*state); - - /* Unlock */ - UnlockReleaseBuffer(*buf); - - *state = GenericXLogStart(index); - *page = GenericXLogRegisterBuffer(*state, newbuf, GENERIC_XLOG_FULL_IMAGE); - *buf = newbuf; -} - -/* - * Update the start or insert page of a list - */ -void -IvfflatUpdateList(Relation index, GenericXLogState *state, ListInfo listInfo, - BlockNumber insertPage, BlockNumber originalInsertPage, - BlockNumber startPage, ForkNumber forkNum) -{ - Buffer buf; - Page page; - IvfflatList list; - bool changed = false; - - buf = ReadBufferExtended(index, forkNum, listInfo.blkno, RBM_NORMAL, NULL); - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - state = GenericXLogStart(index); - page = GenericXLogRegisterBuffer(state, buf, 0); - list = (IvfflatList) PageGetItem(page, PageGetItemId(page, listInfo.offno)); - - if (BlockNumberIsValid(insertPage) && insertPage != list->insertPage) - { - /* Skip update if insert page is lower than original insert page */ - /* This is needed to prevent insert from overwriting vacuum */ - if (!BlockNumberIsValid(originalInsertPage) || 
insertPage >= originalInsertPage) - { - list->insertPage = insertPage; - changed = true; - } - } - - if (BlockNumberIsValid(startPage) && startPage != list->startPage) - { - list->startPage = startPage; - changed = true; - } - - /* Only commit if changed */ - if (changed) - IvfflatCommitBuffer(buf, state); - else - { - GenericXLogAbort(state); - UnlockReleaseBuffer(buf); - } -} -#endif diff --git a/src/postgres/third-party-extensions/pgvector/src/ivfvacuum.c b/src/postgres/third-party-extensions/pgvector/src/ivfvacuum.c deleted file mode 100644 index 3bbedf4e1950..000000000000 --- a/src/postgres/third-party-extensions/pgvector/src/ivfvacuum.c +++ /dev/null @@ -1,161 +0,0 @@ -#ifdef YB_IVFFLAT_INDEX_SUPPORT -#include "postgres.h" - -#include "commands/vacuum.h" -#include "ivfflat.h" -#include "storage/bufmgr.h" - -/* - * Bulk delete tuples from the index - */ -IndexBulkDeleteResult * -ivfflatbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, - IndexBulkDeleteCallback callback, void *callback_state) -{ - Relation index = info->index; - Buffer cbuf; - Page cpage; - Buffer buf; - Page page; - IvfflatList list; - IndexTuple itup; - ItemPointer htup; - OffsetNumber deletable[MaxOffsetNumber]; - int ndeletable; - BlockNumber startPages[MaxOffsetNumber]; - BlockNumber nextblkno = IVFFLAT_HEAD_BLKNO; - BlockNumber searchPage; - BlockNumber insertPage; - GenericXLogState *state; - OffsetNumber coffno; - OffsetNumber cmaxoffno; - OffsetNumber offno; - OffsetNumber maxoffno; - ListInfo listInfo; - BufferAccessStrategy bas = GetAccessStrategy(BAS_BULKREAD); - - if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); - - /* Iterate over list pages */ - while (BlockNumberIsValid(nextblkno)) - { - cbuf = ReadBuffer(index, nextblkno); - LockBuffer(cbuf, BUFFER_LOCK_SHARE); - cpage = BufferGetPage(cbuf); - - cmaxoffno = PageGetMaxOffsetNumber(cpage); - - /* Iterate over lists */ - for (coffno = FirstOffsetNumber; coffno <= 
cmaxoffno; coffno = OffsetNumberNext(coffno)) - { - list = (IvfflatList) PageGetItem(cpage, PageGetItemId(cpage, coffno)); - startPages[coffno - FirstOffsetNumber] = list->startPage; - } - - listInfo.blkno = nextblkno; - nextblkno = IvfflatPageGetOpaque(cpage)->nextblkno; - - UnlockReleaseBuffer(cbuf); - - for (coffno = FirstOffsetNumber; coffno <= cmaxoffno; coffno = OffsetNumberNext(coffno)) - { - searchPage = startPages[coffno - FirstOffsetNumber]; - insertPage = InvalidBlockNumber; - - /* Iterate over entry pages */ - while (BlockNumberIsValid(searchPage)) - { - vacuum_delay_point(); - - buf = ReadBufferExtended(index, MAIN_FORKNUM, searchPage, RBM_NORMAL, bas); - - /* - * ambulkdelete cannot delete entries from pages that are - * pinned by other backends - * - * https://www.postgresql.org/docs/current/index-locking.html - */ - LockBufferForCleanup(buf); - - state = GenericXLogStart(index); - page = GenericXLogRegisterBuffer(state, buf, 0); - - maxoffno = PageGetMaxOffsetNumber(page); - ndeletable = 0; - - /* Find deleted tuples */ - for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) - { - itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); - htup = &(itup->t_tid); - - if (callback(htup, callback_state)) - { - deletable[ndeletable++] = offno; - stats->tuples_removed++; - } - else - stats->num_index_tuples++; - } - - /* Set to first free page */ - /* Must be set before searchPage is updated */ - if (!BlockNumberIsValid(insertPage) && ndeletable > 0) - insertPage = searchPage; - - searchPage = IvfflatPageGetOpaque(page)->nextblkno; - - if (ndeletable > 0) - { - /* Delete tuples */ - PageIndexMultiDelete(page, deletable, ndeletable); - MarkBufferDirty(buf); - GenericXLogFinish(state); - } - else - GenericXLogAbort(state); - - UnlockReleaseBuffer(buf); - } - - /* - * Update after all tuples deleted. - * - * We don't add or delete items from lists pages, so offset won't - * change. 
- */ - if (BlockNumberIsValid(insertPage)) - { - listInfo.offno = coffno; - IvfflatUpdateList(index, state, listInfo, insertPage, InvalidBlockNumber, InvalidBlockNumber, MAIN_FORKNUM); - } - } - } - - FreeAccessStrategy(bas); - - return stats; -} - -/* - * Clean up after a VACUUM operation - */ -IndexBulkDeleteResult * -ivfflatvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) -{ - Relation rel = info->index; - - if (info->analyze_only) - return stats; - - /* stats is NULL if ambulkdelete not called */ - /* OK to return NULL if index not changed */ - if (stats == NULL) - return NULL; - - stats->num_pages = RelationGetNumberOfBlocks(rel); - - return stats; -} -#endif diff --git a/src/postgres/third-party-extensions/pgvector/src/sparsevec.c b/src/postgres/third-party-extensions/pgvector/src/sparsevec.c new file mode 100644 index 000000000000..1893752d8459 --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/src/sparsevec.c @@ -0,0 +1,1256 @@ +#include "postgres.h" + +#include +#include + +#include "catalog/pg_type.h" +#include "common/shortest_dec.h" +#include "common/string.h" +#include "fmgr.h" +#include "halfutils.h" +#include "halfvec.h" +#include "libpq/pqformat.h" +#include "sparsevec.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/float.h" +#include "utils/lsyscache.h" +#include "vector.h" + +typedef struct SparseInputElement +{ + int32 index; + float value; +} SparseInputElement; + +/* + * Ensure same dimensions + */ +static inline void +CheckDims(SparseVector * a, SparseVector * b) +{ + if (a->dim != b->dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("different sparsevec dimensions %d and %d", a->dim, b->dim))); +} + +/* + * Ensure expected dimensions + */ +static inline void +CheckExpectedDim(int32 typmod, int dim) +{ + if (typmod != -1 && typmod != dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("expected %d dimensions, not %d", typmod, dim))); +} + +/* + * Ensure 
valid dimensions + */ +static inline void +CheckDim(int dim) +{ + if (dim < 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("sparsevec must have at least 1 dimension"))); + + if (dim > SPARSEVEC_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("sparsevec cannot have more than %d dimensions", SPARSEVEC_MAX_DIM))); +} + +/* + * Ensure valid nnz + */ +static inline void +CheckNnz(int nnz, int dim) +{ + if (nnz < 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("sparsevec cannot have negative number of elements"))); + + if (nnz > SPARSEVEC_MAX_NNZ) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("sparsevec cannot have more than %d non-zero elements", SPARSEVEC_MAX_NNZ))); + + if (nnz > dim) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("sparsevec cannot have more elements than dimensions"))); +} + +/* + * Ensure valid index + */ +static inline void +CheckIndex(int32 *indices, int i, int dim) +{ + int32 index = indices[i]; + + if (index < 0 || index >= dim) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("sparsevec index out of bounds"))); + } + + if (i > 0) + { + if (index < indices[i - 1]) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("sparsevec indices must be in ascending order"))); + + if (index == indices[i - 1]) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("sparsevec indices must not contain duplicates"))); + } +} + +/* + * Ensure finite element + */ +static inline void +CheckElement(float value) +{ + if (isnan(value)) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("NaN not allowed in sparsevec"))); + + if (isinf(value)) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("infinite value not allowed in sparsevec"))); +} + +/* + * Allocate and initialize a new sparse vector + */ +SparseVector * +InitSparseVector(int dim, int nnz) +{ + SparseVector *result; + int size; + + size = 
SPARSEVEC_SIZE(nnz); + result = (SparseVector *) palloc0(size); + SET_VARSIZE(result, size); + result->dim = dim; + result->nnz = nnz; + + return result; +} + +/* + * Check for whitespace, since array_isspace() is static + */ +static inline bool +sparsevec_isspace(char ch) +{ + if (ch == ' ' || + ch == '\t' || + ch == '\n' || + ch == '\r' || + ch == '\v' || + ch == '\f') + return true; + return false; +} + +/* + * Compare indices + */ +static int +CompareIndices(const void *a, const void *b) +{ + if (((SparseInputElement *) a)->index < ((SparseInputElement *) b)->index) + return -1; + + if (((SparseInputElement *) a)->index > ((SparseInputElement *) b)->index) + return 1; + + return 0; +} + +/* + * Convert textual representation to internal representation + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_in); +Datum +sparsevec_in(PG_FUNCTION_ARGS) +{ + char *lit = PG_GETARG_CSTRING(0); + int32 typmod = PG_GETARG_INT32(2); + long dim; + char *pt = lit; + char *stringEnd; + SparseVector *result; + float *rvalues; + SparseInputElement *elements; + int maxNnz; + int nnz = 0; + + maxNnz = 1; + while (*pt != '\0') + { + if (*pt == ',') + maxNnz++; + + pt++; + } + + if (maxNnz > SPARSEVEC_MAX_NNZ) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("sparsevec cannot have more than %d non-zero elements", SPARSEVEC_MAX_NNZ))); + + elements = palloc(maxNnz * sizeof(SparseInputElement)); + + pt = lit; + + while (sparsevec_isspace(*pt)) + pt++; + + if (*pt != '{') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type sparsevec: \"%s\"", lit), + errdetail("Vector contents must start with \"{\"."))); + + pt++; + + while (sparsevec_isspace(*pt)) + pt++; + + if (*pt == '}') + pt++; + else + { + for (;;) + { + long index; + float value; + + if (nnz == maxNnz) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("ran out of buffer: \"%s\"", lit))); + + while (sparsevec_isspace(*pt)) + pt++; 
+ + /* Check for empty string like float4in */ + if (*pt == '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type sparsevec: \"%s\"", lit))); + + /* Use similar logic as int2vectorin */ + index = strtol(pt, &stringEnd, 10); + + if (stringEnd == pt) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type sparsevec: \"%s\"", lit))); + + /* Keep in int range for correct error message later */ + if (index > INT_MAX) + index = INT_MAX; + else if (index < INT_MIN + 1) + index = INT_MIN + 1; + + pt = stringEnd; + + while (sparsevec_isspace(*pt)) + pt++; + + if (*pt != ':') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type sparsevec: \"%s\"", lit))); + + pt++; + + while (sparsevec_isspace(*pt)) + pt++; + + errno = 0; + + /* Use strtof like float4in to avoid a double-rounding problem */ + /* Postgres sets LC_NUMERIC to C on startup */ + value = strtof(pt, &stringEnd); + + if (stringEnd == pt) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type sparsevec: \"%s\"", lit))); + + /* Check for range error like float4in */ + if (errno == ERANGE && (value == 0 || isinf(value))) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("\"%s\" is out of range for type sparsevec", pnstrdup(pt, stringEnd - pt)))); + + CheckElement(value); + + /* Do not store zero values */ + if (value != 0) + { + /* Convert 1-based numbering (SQL) to 0-based (C) */ + elements[nnz].index = index - 1; + elements[nnz].value = value; + nnz++; + } + + pt = stringEnd; + + while (sparsevec_isspace(*pt)) + pt++; + + if (*pt == ',') + pt++; + else if (*pt == '}') + { + pt++; + break; + } + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type sparsevec: \"%s\"", lit))); + } + } + + while (sparsevec_isspace(*pt)) + pt++; + + 
if (*pt != '/') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type sparsevec: \"%s\"", lit), + errdetail("Unexpected end of input."))); + + pt++; + + while (sparsevec_isspace(*pt)) + pt++; + + /* Use similar logic as int2vectorin */ + dim = strtol(pt, &stringEnd, 10); + + if (stringEnd == pt) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type sparsevec: \"%s\"", lit))); + + /* Keep in int range for correct error message later */ + if (dim > INT_MAX) + dim = INT_MAX; + else if (dim < INT_MIN) + dim = INT_MIN; + + pt = stringEnd; + + /* Only whitespace is allowed after the closing brace */ + while (sparsevec_isspace(*pt)) + pt++; + + if (*pt != '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type sparsevec: \"%s\"", lit), + errdetail("Junk after closing."))); + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + qsort(elements, nnz, sizeof(SparseInputElement), CompareIndices); + + result = InitSparseVector(dim, nnz); + rvalues = SPARSEVEC_VALUES(result); + for (int i = 0; i < nnz; i++) + { + result->indices[i] = elements[i].index; + rvalues[i] = elements[i].value; + + CheckIndex(result->indices, i, dim); + } + + PG_RETURN_POINTER(result); +} + +#define AppendChar(ptr, c) (*(ptr)++ = (c)) +#define AppendFloat(ptr, f) ((ptr) += float_to_shortest_decimal_bufn((f), (ptr))) + +#if PG_VERSION_NUM >= 140000 +#define AppendInt(ptr, i) ((ptr) += pg_ltoa((i), (ptr))) +#else +#define AppendInt(ptr, i) \ + do { \ + pg_ltoa(i, ptr); \ + while (*ptr != '\0') \ + ptr++; \ + } while (0) +#endif + +/* + * Convert internal representation to textual representation + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_out); +Datum +sparsevec_out(PG_FUNCTION_ARGS) +{ + SparseVector *sparsevec = PG_GETARG_SPARSEVEC_P(0); + float *values = SPARSEVEC_VALUES(sparsevec); + char *buf; + char *ptr; + + /* + * Need: + * + * nnz * 
10 bytes for index (positive integer) + * + * nnz bytes for : + * + * nnz * (FLOAT_SHORTEST_DECIMAL_LEN - 1) bytes for + * float_to_shortest_decimal_bufn + * + * nnz - 1 bytes for , + * + * 10 bytes for dimensions + * + * 4 bytes for {, }, /, and \0 + */ + buf = (char *) palloc((11 + FLOAT_SHORTEST_DECIMAL_LEN) * sparsevec->nnz + 13); + ptr = buf; + + AppendChar(ptr, '{'); + + for (int i = 0; i < sparsevec->nnz; i++) + { + if (i > 0) + AppendChar(ptr, ','); + + /* Convert 0-based numbering (C) to 1-based (SQL) */ + AppendInt(ptr, sparsevec->indices[i] + 1); + AppendChar(ptr, ':'); + AppendFloat(ptr, values[i]); + } + + AppendChar(ptr, '}'); + AppendChar(ptr, '/'); + AppendInt(ptr, sparsevec->dim); + *ptr = '\0'; + + PG_FREE_IF_COPY(sparsevec, 0); + PG_RETURN_CSTRING(buf); +} + +/* + * Convert type modifier + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_typmod_in); +Datum +sparsevec_typmod_in(PG_FUNCTION_ARGS) +{ + ArrayType *ta = PG_GETARG_ARRAYTYPE_P(0); + int32 *tl; + int n; + + tl = ArrayGetIntegerTypmods(ta, &n); + + if (n != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid type modifier"))); + + if (*tl < 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("dimensions for type sparsevec must be at least 1"))); + + if (*tl > SPARSEVEC_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("dimensions for type sparsevec cannot exceed %d", SPARSEVEC_MAX_DIM))); + + PG_RETURN_INT32(*tl); +} + +/* + * Convert external binary representation to internal representation + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_recv); +Datum +sparsevec_recv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + int32 typmod = PG_GETARG_INT32(2); + SparseVector *result; + int32 dim; + int32 nnz; + int32 unused; + float *values; + + dim = pq_getmsgint(buf, sizeof(int32)); + nnz = pq_getmsgint(buf, sizeof(int32)); + unused = pq_getmsgint(buf, sizeof(int32)); + + 
CheckDim(dim); + CheckNnz(nnz, dim); + CheckExpectedDim(typmod, dim); + + if (unused != 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("expected unused to be 0, not %d", unused))); + + result = InitSparseVector(dim, nnz); + values = SPARSEVEC_VALUES(result); + + /* Binary representation uses zero-based numbering for indices */ + for (int i = 0; i < nnz; i++) + { + result->indices[i] = pq_getmsgint(buf, sizeof(int32)); + CheckIndex(result->indices, i, dim); + } + + for (int i = 0; i < nnz; i++) + { + values[i] = pq_getmsgfloat4(buf); + CheckElement(values[i]); + + if (values[i] == 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("binary representation of sparsevec cannot contain zero values"))); + } + + PG_RETURN_POINTER(result); +} + +/* + * Convert internal representation to the external binary representation + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_send); +Datum +sparsevec_send(PG_FUNCTION_ARGS) +{ + SparseVector *svec = PG_GETARG_SPARSEVEC_P(0); + float *values = SPARSEVEC_VALUES(svec); + StringInfoData buf; + + pq_begintypsend(&buf); + pq_sendint(&buf, svec->dim, sizeof(int32)); + pq_sendint(&buf, svec->nnz, sizeof(int32)); + pq_sendint(&buf, svec->unused, sizeof(int32)); + + /* Binary representation uses zero-based numbering for indices */ + for (int i = 0; i < svec->nnz; i++) + pq_sendint(&buf, svec->indices[i], sizeof(int32)); + + for (int i = 0; i < svec->nnz; i++) + pq_sendfloat4(&buf, values[i]); + + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); +} + +/* + * Convert sparse vector to sparse vector + * This is needed to check the type modifier + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec); +Datum +sparsevec(PG_FUNCTION_ARGS) +{ + SparseVector *svec = PG_GETARG_SPARSEVEC_P(0); + int32 typmod = PG_GETARG_INT32(1); + + CheckExpectedDim(typmod, svec->dim); + + PG_RETURN_POINTER(svec); +} + +/* + * Convert dense vector to sparse vector + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_to_sparsevec); +Datum 
+vector_to_sparsevec(PG_FUNCTION_ARGS) +{ + Vector *vec = PG_GETARG_VECTOR_P(0); + int32 typmod = PG_GETARG_INT32(1); + SparseVector *result; + int dim = vec->dim; + int nnz = 0; + float *values; + int j = 0; + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + for (int i = 0; i < dim; i++) + { + if (vec->x[i] != 0) + nnz++; + } + + result = InitSparseVector(dim, nnz); + values = SPARSEVEC_VALUES(result); + for (int i = 0; i < dim; i++) + { + if (vec->x[i] != 0) + { + /* Safety check */ + if (j >= result->nnz) + elog(ERROR, "safety check failed"); + + result->indices[j] = i; + values[j] = vec->x[i]; + j++; + } + } + + PG_RETURN_POINTER(result); +} + +/* + * Convert half vector to sparse vector + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_to_sparsevec); +Datum +halfvec_to_sparsevec(PG_FUNCTION_ARGS) +{ + HalfVector *vec = PG_GETARG_HALFVEC_P(0); + int32 typmod = PG_GETARG_INT32(1); + SparseVector *result; + int dim = vec->dim; + int nnz = 0; + float *values; + int j = 0; + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + for (int i = 0; i < dim; i++) + { + if (!HalfIsZero(vec->x[i])) + nnz++; + } + + result = InitSparseVector(dim, nnz); + values = SPARSEVEC_VALUES(result); + for (int i = 0; i < dim; i++) + { + if (!HalfIsZero(vec->x[i])) + { + /* Safety check */ + if (j >= result->nnz) + elog(ERROR, "safety check failed"); + + result->indices[j] = i; + values[j] = HalfToFloat4(vec->x[i]); + j++; + } + } + + PG_RETURN_POINTER(result); +} + +/* + * Convert array to sparse vector + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(array_to_sparsevec); +Datum +array_to_sparsevec(PG_FUNCTION_ARGS) +{ + ArrayType *array = PG_GETARG_ARRAYTYPE_P(0); + int32 typmod = PG_GETARG_INT32(1); + SparseVector *result; + int16 typlen; + bool typbyval; + char typalign; + Datum *elemsp; + int nelemsp; + int nnz = 0; + float *values; + int j = 0; + + if (ARR_NDIM(array) > 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("array must be 1-D"))); + + if 
(ARR_HASNULL(array) && array_contains_nulls(array)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("array must not contain nulls"))); + + get_typlenbyvalalign(ARR_ELEMTYPE(array), &typlen, &typbyval, &typalign); + deconstruct_array(array, ARR_ELEMTYPE(array), typlen, typbyval, typalign, &elemsp, NULL, &nelemsp); + + CheckDim(nelemsp); + CheckExpectedDim(typmod, nelemsp); + +#ifdef _MSC_VER +/* /fp:fast may not propagate +/-Infinity or NaN */ +#define IS_NOT_ZERO(v) (isnan((float) (v)) || isinf((float) (v)) || ((float) (v)) != 0) +#else +#define IS_NOT_ZERO(v) (((float) (v)) != 0) +#endif + + if (ARR_ELEMTYPE(array) == INT4OID) + { + for (int i = 0; i < nelemsp; i++) + nnz += IS_NOT_ZERO(DatumGetInt32(elemsp[i])); + } + else if (ARR_ELEMTYPE(array) == FLOAT8OID) + { + for (int i = 0; i < nelemsp; i++) + nnz += IS_NOT_ZERO(DatumGetFloat8(elemsp[i])); + } + else if (ARR_ELEMTYPE(array) == FLOAT4OID) + { + for (int i = 0; i < nelemsp; i++) + nnz += IS_NOT_ZERO(DatumGetFloat4(elemsp[i])); + } + else if (ARR_ELEMTYPE(array) == NUMERICOID) + { + for (int i = 0; i < nelemsp; i++) + nnz += IS_NOT_ZERO(DirectFunctionCall1(numeric_float4, elemsp[i])); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("unsupported array type"))); + } + + result = InitSparseVector(nelemsp, nnz); + values = SPARSEVEC_VALUES(result); + +#define PROCESS_ARRAY_ELEM(elem) \ + do { \ + float v = (float) (elem); \ + if (IS_NOT_ZERO(v)) { \ + /* Safety check */ \ + if (j >= result->nnz) \ + elog(ERROR, "safety check failed"); \ + result->indices[j] = i; \ + values[j] = v; \ + j++; \ + } \ + } while (0) + + if (ARR_ELEMTYPE(array) == INT4OID) + { + for (int i = 0; i < nelemsp; i++) + PROCESS_ARRAY_ELEM(DatumGetInt32(elemsp[i])); + } + else if (ARR_ELEMTYPE(array) == FLOAT8OID) + { + for (int i = 0; i < nelemsp; i++) + PROCESS_ARRAY_ELEM(DatumGetFloat8(elemsp[i])); + } + else if (ARR_ELEMTYPE(array) == FLOAT4OID) + { + for (int i = 0; i < nelemsp; i++) + 
PROCESS_ARRAY_ELEM(DatumGetFloat4(elemsp[i])); + } + else if (ARR_ELEMTYPE(array) == NUMERICOID) + { + for (int i = 0; i < nelemsp; i++) + PROCESS_ARRAY_ELEM(DatumGetFloat4(DirectFunctionCall1(numeric_float4, elemsp[i]))); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("unsupported array type"))); + } + +#undef PROCESS_ARRAY_ELEM +#undef IS_NOT_ZERO + + /* + * Free allocation from deconstruct_array. Do not free individual elements + * when pass-by-reference since they point to original array. + */ + pfree(elemsp); + + if (j != result->nnz) + elog(ERROR, "correctness check failed"); + + /* Check elements */ + for (int i = 0; i < result->nnz; i++) + CheckElement(values[i]); + + PG_RETURN_POINTER(result); +} + +/* + * Get the L2 squared distance between sparse vectors + */ +static float +SparsevecL2SquaredDistance(SparseVector * a, SparseVector * b) +{ + float *ax = SPARSEVEC_VALUES(a); + float *bx = SPARSEVEC_VALUES(b); + float distance = 0.0; + int bpos = 0; + + for (int i = 0; i < a->nnz; i++) + { + int ai = a->indices[i]; + int bi = -1; + + for (int j = bpos; j < b->nnz; j++) + { + bi = b->indices[j]; + + if (ai == bi) + { + float diff = ax[i] - bx[j]; + + distance += diff * diff; + } + else if (ai > bi) + distance += bx[j] * bx[j]; + + /* Update start for next iteration */ + if (ai >= bi) + bpos = j + 1; + + /* Found or passed it */ + if (bi >= ai) + break; + } + + if (ai != bi) + distance += ax[i] * ax[i]; + } + + for (int j = bpos; j < b->nnz; j++) + distance += bx[j] * bx[j]; + + return distance; +} + +/* + * Get the L2 distance between sparse vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_l2_distance); +Datum +sparsevec_l2_distance(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8(sqrt((double) SparsevecL2SquaredDistance(a, b))); +} + +/* + * Get the L2 squared distance between sparse vectors + * This saves a sqrt 
calculation + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_l2_squared_distance); +Datum +sparsevec_l2_squared_distance(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8((double) SparsevecL2SquaredDistance(a, b)); +} + +/* + * Get the inner product of two sparse vectors + */ +static float +SparsevecInnerProduct(SparseVector * a, SparseVector * b) +{ + float *ax = SPARSEVEC_VALUES(a); + float *bx = SPARSEVEC_VALUES(b); + float distance = 0.0; + int bpos = 0; + + for (int i = 0; i < a->nnz; i++) + { + int ai = a->indices[i]; + + for (int j = bpos; j < b->nnz; j++) + { + int bi = b->indices[j]; + + /* Only update when the same index */ + if (ai == bi) + distance += ax[i] * bx[j]; + + /* Update start for next iteration */ + if (ai >= bi) + bpos = j + 1; + + /* Found or passed it */ + if (bi >= ai) + break; + } + } + + return distance; +} + +/* + * Get the inner product of two sparse vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_inner_product); +Datum +sparsevec_inner_product(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8((double) SparsevecInnerProduct(a, b)); +} + +/* + * Get the negative inner product of two sparse vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_negative_inner_product); +Datum +sparsevec_negative_inner_product(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8((double) -SparsevecInnerProduct(a, b)); +} + +/* + * Get the cosine distance between two sparse vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_cosine_distance); +Datum +sparsevec_cosine_distance(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + float *ax = SPARSEVEC_VALUES(a); + 
float *bx = SPARSEVEC_VALUES(b); + float norma = 0.0; + float normb = 0.0; + double similarity; + + CheckDims(a, b); + + similarity = SparsevecInnerProduct(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->nnz; i++) + norma += ax[i] * ax[i]; + + /* Auto-vectorized */ + for (int i = 0; i < b->nnz; i++) + normb += bx[i] * bx[i]; + + /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ + similarity /= sqrt((double) norma * (double) normb); + +#ifdef _MSC_VER + /* /fp:fast may not propagate NaN */ + if (isnan(similarity)) + PG_RETURN_FLOAT8(NAN); +#endif + + /* Keep in range */ + if (similarity > 1) + similarity = 1.0; + else if (similarity < -1) + similarity = -1.0; + + PG_RETURN_FLOAT8(1.0 - similarity); +} + +/* + * Get the L1 distance between two sparse vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_l1_distance); +Datum +sparsevec_l1_distance(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + float *ax = SPARSEVEC_VALUES(a); + float *bx = SPARSEVEC_VALUES(b); + float distance = 0.0; + int bpos = 0; + + CheckDims(a, b); + + for (int i = 0; i < a->nnz; i++) + { + int ai = a->indices[i]; + int bi = -1; + + for (int j = bpos; j < b->nnz; j++) + { + bi = b->indices[j]; + + if (ai == bi) + distance += fabsf(ax[i] - bx[j]); + else if (ai > bi) + distance += fabsf(bx[j]); + + /* Update start for next iteration */ + if (ai >= bi) + bpos = j + 1; + + /* Found or passed it */ + if (bi >= ai) + break; + } + + if (ai != bi) + distance += fabsf(ax[i]); + } + + for (int j = bpos; j < b->nnz; j++) + distance += fabsf(bx[j]); + + PG_RETURN_FLOAT8((double) distance); +} + +/* + * Get the L2 norm of a sparse vector + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_l2_norm); +Datum +sparsevec_l2_norm(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + float *ax = SPARSEVEC_VALUES(a); + double norm = 0.0; + + /* Auto-vectorized */ + for (int i = 0; i < a->nnz; i++) + norm += (double) ax[i] * 
(double) ax[i]; + + PG_RETURN_FLOAT8(sqrt(norm)); +} + +/* + * Normalize a sparse vector with the L2 norm + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_l2_normalize); +Datum +sparsevec_l2_normalize(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + float *ax = SPARSEVEC_VALUES(a); + double norm = 0; + SparseVector *result; + float *rx; + + result = InitSparseVector(a->dim, a->nnz); + rx = SPARSEVEC_VALUES(result); + + /* Auto-vectorized */ + for (int i = 0; i < a->nnz; i++) + norm += (double) ax[i] * (double) ax[i]; + + norm = sqrt(norm); + + /* Return zero vector for zero norm */ + if (norm > 0) + { + int zeros = 0; + + for (int i = 0; i < a->nnz; i++) + { + result->indices[i] = a->indices[i]; + rx[i] = ax[i] / norm; + + if (isinf(rx[i])) + float_overflow_error(); + + if (rx[i] == 0) + zeros++; + } + + /* Allocate a new vector in the unlikely event there are zeros */ + if (zeros > 0) + { + SparseVector *newResult = InitSparseVector(result->dim, result->nnz - zeros); + float *nx = SPARSEVEC_VALUES(newResult); + int j = 0; + + for (int i = 0; i < result->nnz; i++) + { + if (rx[i] == 0) + continue; + + /* Safety check */ + if (j >= newResult->nnz) + elog(ERROR, "safety check failed"); + + newResult->indices[j] = result->indices[i]; + nx[j] = rx[i]; + j++; + } + + pfree(result); + + PG_RETURN_POINTER(newResult); + } + } + + PG_RETURN_POINTER(result); +} + +/* + * Internal helper to compare sparse vectors + */ +static int +sparsevec_cmp_internal(SparseVector * a, SparseVector * b) +{ + float *ax = SPARSEVEC_VALUES(a); + float *bx = SPARSEVEC_VALUES(b); + int nnz = Min(a->nnz, b->nnz); + + /* Check values before dimensions to be consistent with Postgres arrays */ + for (int i = 0; i < nnz; i++) + { + if (a->indices[i] < b->indices[i]) + return ax[i] < 0 ? -1 : 1; + + if (a->indices[i] > b->indices[i]) + return bx[i] < 0 ? 
1 : -1; + + if (ax[i] < bx[i]) + return -1; + + if (ax[i] > bx[i]) + return 1; + } + + if (a->nnz < b->nnz && b->indices[nnz] < a->dim) + return bx[nnz] < 0 ? 1 : -1; + + if (a->nnz > b->nnz && a->indices[nnz] < b->dim) + return ax[nnz] < 0 ? -1 : 1; + + if (a->dim < b->dim) + return -1; + + if (a->dim > b->dim) + return 1; + + return 0; +} + +/* + * Less than + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_lt); +Datum +sparsevec_lt(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + PG_RETURN_BOOL(sparsevec_cmp_internal(a, b) < 0); +} + +/* + * Less than or equal + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_le); +Datum +sparsevec_le(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + PG_RETURN_BOOL(sparsevec_cmp_internal(a, b) <= 0); +} + +/* + * Equal + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_eq); +Datum +sparsevec_eq(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + PG_RETURN_BOOL(sparsevec_cmp_internal(a, b) == 0); +} + +/* + * Not equal + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_ne); +Datum +sparsevec_ne(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + PG_RETURN_BOOL(sparsevec_cmp_internal(a, b) != 0); +} + +/* + * Greater than or equal + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_ge); +Datum +sparsevec_ge(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + PG_RETURN_BOOL(sparsevec_cmp_internal(a, b) >= 0); +} + +/* + * Greater than + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_gt); +Datum +sparsevec_gt(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + PG_RETURN_BOOL(sparsevec_cmp_internal(a, b) > 0); +} + +/* + * Compare sparse 
vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_cmp); +Datum +sparsevec_cmp(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + PG_RETURN_INT32(sparsevec_cmp_internal(a, b)); +} diff --git a/src/postgres/third-party-extensions/pgvector/src/sparsevec.h b/src/postgres/third-party-extensions/pgvector/src/sparsevec.h new file mode 100644 index 000000000000..e663c519af58 --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/src/sparsevec.h @@ -0,0 +1,40 @@ +#ifndef SPARSEVEC_H +#define SPARSEVEC_H + +#define SPARSEVEC_MAX_DIM 1000000000 +#define SPARSEVEC_MAX_NNZ 16000 + +#define DatumGetSparseVector(x) ((SparseVector *) PG_DETOAST_DATUM(x)) +#define PG_GETARG_SPARSEVEC_P(x) DatumGetSparseVector(PG_GETARG_DATUM(x)) +#define PG_RETURN_SPARSEVEC_P(x) PG_RETURN_POINTER(x) + +/* + * Indices use 0-based numbering for the on-disk (and binary) format (consistent with C) + * and are always sorted. Values come after indices. + */ +typedef struct SparseVector +{ + int32 vl_len_; /* varlena header (do not touch directly!) 
*/ + int32 dim; /* number of dimensions */ + int32 nnz; /* number of non-zero elements */ + int32 unused; /* reserved for future use, always zero */ + int32 indices[FLEXIBLE_ARRAY_MEMBER]; +} SparseVector; + +/* Use functions instead of macros to avoid double evaluation */ + +static inline Size +SPARSEVEC_SIZE(int nnz) +{ + return offsetof(SparseVector, indices) + (nnz * sizeof(int32)) + (nnz * sizeof(float)); /* header + nnz indices + nnz values */ +} + +static inline float * +SPARSEVEC_VALUES(SparseVector * x) +{ + return (float *) (((char *) x) + offsetof(SparseVector, indices) + (x->nnz * sizeof(int32))); /* values start right after the nnz indices */ +} + +SparseVector *InitSparseVector(int dim, int nnz); + +#endif diff --git a/src/postgres/third-party-extensions/pgvector/src/vector.c b/src/postgres/third-party-extensions/pgvector/src/vector.c index 62b41250cc80..d79edbba4c71 100644 --- a/src/postgres/third-party-extensions/pgvector/src/vector.c +++ b/src/postgres/third-party-extensions/pgvector/src/vector.c @@ -2,42 +2,43 @@ #include <math.h> -#include "vector.h" -#include "fmgr.h" +#include "bitutils.h" +#include "bitvec.h" #include "catalog/pg_type.h" +#include "common/shortest_dec.h" +#include "fmgr.h" +#include "halfutils.h" +#include "halfvec.h" +/* #include "hnsw.h" */ /* YB: Unused in YB. */ +/* #include "ivfflat.h" */ /* YB: Unused in YB. */ #include "lib/stringinfo.h" #include "libpq/pqformat.h" +#include "port.h" /* for strtof() */ +#include "sparsevec.h" #include "utils/array.h" #include "utils/builtins.h" +#include "utils/float.h" #include "utils/lsyscache.h" #include "utils/numeric.h" +#include "vector.h" -#if PG_VERSION_NUM >= 120000 -#include "common/shortest_dec.h" -#include "utils/float.h" -#else -#include <float.h> -/* - * YB Note: PG versions < 12 declared the int 'extra_float_digits' in - * utils/builtins.h. As part of commit a09ec42b5cadc5993da902c52c2e399f012eeebf - * this was moved to utils/float.h. To avoid including the header file for a - * single int, 'extra_float_digits' is declared here. 
- */ -extern PGDLLIMPORT int extra_float_digits; -#endif - -#if PG_VERSION_NUM < 130000 -#define TYPALIGN_DOUBLE 'd' -#define TYPALIGN_INT 'i' +#if PG_VERSION_NUM >= 160000 +#include "varatt.h" #endif #define STATE_DIMS(x) (ARR_DIMS(x)[0] - 1) #define CreateStateDatums(dim) palloc(sizeof(Datum) * (dim + 1)) -extern void YbVectorInit(); +#if defined(USE_TARGET_CLONES) && !defined(__FMA__) +#define VECTOR_TARGET_CLONES __attribute__((target_clones("default", "fma"))) +#else +#define VECTOR_TARGET_CLONES +#endif PG_MODULE_MAGIC; +extern void YbVectorInit(); + /* * Initialize index options and variables */ @@ -45,6 +46,12 @@ PGDLLEXPORT void _PG_init(void); void _PG_init(void) { + BitvecInit(); + HalfvecInit(); + + /* YB note: Not used in YB as they are unsupported. */ + /* HnswInit(); */ + /* IvfflatInit(); */ YbVectorInit(); } @@ -90,7 +97,7 @@ CheckDim(int dim) } /* - * Ensure finite elements + * Ensure finite element */ static inline void CheckElement(float value) @@ -106,6 +113,23 @@ CheckElement(float value) errmsg("infinite value not allowed in vector"))); } +/* + * Allocate and initialize a new vector + */ +Vector * +InitVector(int dim) +{ + Vector *result; + int size; + + size = VECTOR_SIZE(dim); + result = (Vector *) palloc0(size); + SET_VARSIZE(result, size); + result->dim = dim; + + return result; +} + /* * Check for whitespace, since array_isspace() is static */ @@ -136,48 +160,44 @@ CheckStateArray(ArrayType *statearray, const char *caller) return (float8 *) ARR_DATA_PTR(statearray); } -#if PG_VERSION_NUM < 120003 -static pg_noinline void -float_overflow_error(void) -{ - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("value out of range: overflow"))); -} -#endif - /* * Convert textual representation to internal representation */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_in); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_in); Datum vector_in(PG_FUNCTION_ARGS) { - char *str = PG_GETARG_CSTRING(0); + char *lit = PG_GETARG_CSTRING(0); 
int32 typmod = PG_GETARG_INT32(2); - int i; float x[VECTOR_MAX_DIM]; int dim = 0; - char *pt; - char *stringEnd; + char *pt = lit; Vector *result; - char *lit = pstrdup(str); - while (vector_isspace(*str)) - str++; + while (vector_isspace(*pt)) + pt++; - if (*str != '[') + if (*pt != '[') ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("malformed vector literal: \"%s\"", lit), + errmsg("invalid input syntax for type vector: \"%s\"", lit), errdetail("Vector contents must start with \"[\"."))); - str++; - pt = strtok(str, ","); - stringEnd = pt; + pt++; + + while (vector_isspace(*pt)) + pt++; + + if (*pt == ']') + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("vector must have at least 1 dimension"))); - while (pt != NULL && *stringEnd != ']') + for (;;) { + float val; + char *stringEnd; + if (dim == VECTOR_MAX_DIM) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), @@ -192,74 +212,71 @@ vector_in(PG_FUNCTION_ARGS) (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type vector: \"%s\"", lit))); + errno = 0; + /* Use strtof like float4in to avoid a double-rounding problem */ - x[dim] = strtof(pt, &stringEnd); - CheckElement(x[dim]); - dim++; + /* Postgres sets LC_NUMERIC to C on startup */ + val = strtof(pt, &stringEnd); if (stringEnd == pt) ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type vector: \"%s\"", lit))); - while (vector_isspace(*stringEnd)) - stringEnd++; + /* Check for range error like float4in */ + if (errno == ERANGE && isinf(val)) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("\"%s\" is out of range for type vector", pnstrdup(pt, stringEnd - pt)))); + + CheckElement(val); + x[dim++] = val; + + pt = stringEnd; + + while (vector_isspace(*pt)) + pt++; - if (*stringEnd != '\0' && *stringEnd != ']') + if (*pt == ',') + pt++; + else if (*pt == ']') + { + pt++; + break; + } + else ereport(ERROR, 
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type vector: \"%s\"", lit))); - - pt = strtok(NULL, ","); } - if (stringEnd == NULL || *stringEnd != ']') - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("malformed vector literal: \"%s\"", lit), - errdetail("Unexpected end of input."))); - - stringEnd++; - /* Only whitespace is allowed after the closing brace */ - while (vector_isspace(*stringEnd)) - stringEnd++; + while (vector_isspace(*pt)) + pt++; - if (*stringEnd != '\0') + if (*pt != '\0') ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("malformed vector literal: \"%s\"", lit), + errmsg("invalid input syntax for type vector: \"%s\"", lit), errdetail("Junk after closing right brace."))); - /* Ensure no consecutive delimiters since strtok skips */ - for (pt = lit + 1; *pt != '\0'; pt++) - { - if (pt[-1] == ',' && *pt == ',') - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("malformed vector literal: \"%s\"", lit))); - } - - if (dim < 1) - ereport(ERROR, - (errcode(ERRCODE_DATA_EXCEPTION), - errmsg("vector must have at least 1 dimension"))); - - pfree(lit); - + CheckDim(dim); CheckExpectedDim(typmod, dim); result = InitVector(dim); - for (i = 0; i < dim; i++) + for (int i = 0; i < dim; i++) result->x[i] = x[i]; PG_RETURN_POINTER(result); } +#define AppendChar(ptr, c) (*(ptr)++ = (c)) +#define AppendFloat(ptr, f) ((ptr) += float_to_shortest_decimal_bufn((f), (ptr))) + /* * Convert internal representation to textual representation */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_out); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_out); Datum vector_out(PG_FUNCTION_ARGS) { @@ -267,17 +284,6 @@ vector_out(PG_FUNCTION_ARGS) int dim = vector->dim; char *buf; char *ptr; - int i; - int n; - -#if PG_VERSION_NUM < 120000 - int ndig = FLT_DIG + extra_float_digits; - - if (ndig < 1) - ndig = 1; - -#define FLOAT_SHORTEST_DECIMAL_LEN (ndig + 10) -#endif /* * Need: @@ -292,25 
+298,17 @@ vector_out(PG_FUNCTION_ARGS) buf = (char *) palloc(FLOAT_SHORTEST_DECIMAL_LEN * dim + 2); ptr = buf; - *ptr = '['; - ptr++; - for (i = 0; i < dim; i++) + AppendChar(ptr, '['); + + for (int i = 0; i < dim; i++) { if (i > 0) - { - *ptr = ','; - ptr++; - } + AppendChar(ptr, ','); -#if PG_VERSION_NUM >= 120000 - n = float_to_shortest_decimal_bufn(vector->x[i], ptr); -#else - n = sprintf(ptr, "%.*g", ndig, vector->x[i]); -#endif - ptr += n; + AppendFloat(ptr, vector->x[i]); } - *ptr = ']'; - ptr++; + + AppendChar(ptr, ']'); *ptr = '\0'; PG_FREE_IF_COPY(vector, 0); @@ -332,7 +330,7 @@ PrintVector(char *msg, Vector * vector) /* * Convert type modifier */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_typmod_in); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_typmod_in); Datum vector_typmod_in(PG_FUNCTION_ARGS) { @@ -363,7 +361,7 @@ vector_typmod_in(PG_FUNCTION_ARGS) /* * Convert external binary representation to internal representation */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_recv); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_recv); Datum vector_recv(PG_FUNCTION_ARGS) { @@ -372,7 +370,6 @@ vector_recv(PG_FUNCTION_ARGS) Vector *result; int16 dim; int16 unused; - int i; dim = pq_getmsgint(buf, sizeof(int16)); unused = pq_getmsgint(buf, sizeof(int16)); @@ -386,7 +383,7 @@ vector_recv(PG_FUNCTION_ARGS) errmsg("expected unused to be 0, not %d", unused))); result = InitVector(dim); - for (i = 0; i < dim; i++) + for (int i = 0; i < dim; i++) { result->x[i] = pq_getmsgfloat4(buf); CheckElement(result->x[i]); @@ -398,18 +395,17 @@ vector_recv(PG_FUNCTION_ARGS) /* * Convert internal representation to the external binary representation */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_send); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_send); Datum vector_send(PG_FUNCTION_ARGS) { Vector *vec = PG_GETARG_VECTOR_P(0); StringInfoData buf; - int i; pq_begintypsend(&buf); pq_sendint(&buf, vec->dim, sizeof(int16)); pq_sendint(&buf, vec->unused, sizeof(int16)); - for (i = 0; i < vec->dim; i++) 
+ for (int i = 0; i < vec->dim; i++) pq_sendfloat4(&buf, vec->x[i]); PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); @@ -417,35 +413,34 @@ vector_send(PG_FUNCTION_ARGS) /* * Convert vector to vector + * This is needed to check the type modifier */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector); Datum vector(PG_FUNCTION_ARGS) { - Vector *arg = PG_GETARG_VECTOR_P(0); + Vector *vec = PG_GETARG_VECTOR_P(0); int32 typmod = PG_GETARG_INT32(1); - CheckExpectedDim(typmod, arg->dim); + CheckExpectedDim(typmod, vec->dim); - PG_RETURN_POINTER(arg); + PG_RETURN_POINTER(vec); } /* * Convert array to vector */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(array_to_vector); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(array_to_vector); Datum array_to_vector(PG_FUNCTION_ARGS) { ArrayType *array = PG_GETARG_ARRAYTYPE_P(0); int32 typmod = PG_GETARG_INT32(1); - int i; Vector *result; int16 typlen; bool typbyval; char typalign; Datum *elemsp; - bool *nullsp; int nelemsp; if (ARR_NDIM(array) > 1) @@ -453,36 +448,55 @@ array_to_vector(PG_FUNCTION_ARGS) (errcode(ERRCODE_DATA_EXCEPTION), errmsg("array must be 1-D"))); + if (ARR_HASNULL(array) && array_contains_nulls(array)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("array must not contain nulls"))); + get_typlenbyvalalign(ARR_ELEMTYPE(array), &typlen, &typbyval, &typalign); - deconstruct_array(array, ARR_ELEMTYPE(array), typlen, typbyval, typalign, &elemsp, &nullsp, &nelemsp); + deconstruct_array(array, ARR_ELEMTYPE(array), typlen, typbyval, typalign, &elemsp, NULL, &nelemsp); CheckDim(nelemsp); CheckExpectedDim(typmod, nelemsp); result = InitVector(nelemsp); - for (i = 0; i < nelemsp; i++) - { - if (nullsp[i]) - ereport(ERROR, - (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), - errmsg("array must not containing NULLs"))); - /* TODO Move outside loop in 0.5.0 */ - if (ARR_ELEMTYPE(array) == INT4OID) + if (ARR_ELEMTYPE(array) == INT4OID) + { + for (int i = 0; i < nelemsp; i++) result->x[i] = 
DatumGetInt32(elemsp[i]); - else if (ARR_ELEMTYPE(array) == FLOAT8OID) + } + else if (ARR_ELEMTYPE(array) == FLOAT8OID) + { + for (int i = 0; i < nelemsp; i++) result->x[i] = DatumGetFloat8(elemsp[i]); - else if (ARR_ELEMTYPE(array) == FLOAT4OID) + } + else if (ARR_ELEMTYPE(array) == FLOAT4OID) + { + for (int i = 0; i < nelemsp; i++) result->x[i] = DatumGetFloat4(elemsp[i]); - else if (ARR_ELEMTYPE(array) == NUMERICOID) + } + else if (ARR_ELEMTYPE(array) == NUMERICOID) + { + for (int i = 0; i < nelemsp; i++) result->x[i] = DatumGetFloat4(DirectFunctionCall1(numeric_float4, elemsp[i])); - else - ereport(ERROR, - (errcode(ERRCODE_DATA_EXCEPTION), - errmsg("unsupported array type"))); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("unsupported array type"))); + } + + /* + * Free allocation from deconstruct_array. Do not free individual elements + * when pass-by-reference since they point to original array. + */ + pfree(elemsp); + /* Check elements */ + for (int i = 0; i < result->dim; i++) CheckElement(result->x[i]); - } PG_RETURN_POINTER(result); } @@ -490,18 +504,17 @@ array_to_vector(PG_FUNCTION_ARGS) /* * Convert vector to float4[] */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_to_float4); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_to_float4); Datum vector_to_float4(PG_FUNCTION_ARGS) { Vector *vec = PG_GETARG_VECTOR_P(0); Datum *datums; ArrayType *result; - int i; datums = (Datum *) palloc(sizeof(Datum) * vec->dim); - for (i = 0; i < vec->dim; i++) + for (int i = 0; i < vec->dim; i++) datums[i] = Float4GetDatum(vec->x[i]); /* Use TYPALIGN_INT for float4 */ @@ -513,129 +526,163 @@ vector_to_float4(PG_FUNCTION_ARGS) } /* - * Get the L2 distance between vectors + * Convert half vector to vector */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(halfvec_to_vector); Datum -l2_distance(PG_FUNCTION_ARGS) +halfvec_to_vector(PG_FUNCTION_ARGS) { - Vector *a = PG_GETARG_VECTOR_P(0); - Vector *b = 
PG_GETARG_VECTOR_P(1); - float *ax = a->x; - float *bx = b->x; - double distance = 0.0; - double diff; + HalfVector *vec = PG_GETARG_HALFVEC_P(0); + int32 typmod = PG_GETARG_INT32(1); + Vector *result; - CheckDims(a, b); + CheckDim(vec->dim); + CheckExpectedDim(typmod, vec->dim); + + result = InitVector(vec->dim); + + for (int i = 0; i < vec->dim; i++) + result->x[i] = HalfToFloat4(vec->x[i]); + + PG_RETURN_POINTER(result); +} + +VECTOR_TARGET_CLONES static float +VectorL2SquaredDistance(int dim, float *ax, float *bx) +{ + float distance = 0.0; /* Auto-vectorized */ - for (int i = 0; i < a->dim; i++) + for (int i = 0; i < dim; i++) { - diff = ax[i] - bx[i]; + float diff = ax[i] - bx[i]; + distance += diff * diff; } - PG_RETURN_FLOAT8(sqrt(distance)); + return distance; +} + +/* + * Get the L2 distance between vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(l2_distance); +Datum +l2_distance(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8(sqrt((double) VectorL2SquaredDistance(a->dim, a->x, b->x))); } /* * Get the L2 squared distance between vectors * This saves a sqrt calculation */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_l2_squared_distance); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_l2_squared_distance); Datum vector_l2_squared_distance(PG_FUNCTION_ARGS) { Vector *a = PG_GETARG_VECTOR_P(0); Vector *b = PG_GETARG_VECTOR_P(1); - float *ax = a->x; - float *bx = b->x; - double distance = 0.0; - double diff; CheckDims(a, b); + PG_RETURN_FLOAT8((double) VectorL2SquaredDistance(a->dim, a->x, b->x)); +} + +VECTOR_TARGET_CLONES static float +VectorInnerProduct(int dim, float *ax, float *bx) +{ + float distance = 0.0; + /* Auto-vectorized */ - for (int i = 0; i < a->dim; i++) - { - diff = ax[i] - bx[i]; - distance += diff * diff; - } + for (int i = 0; i < dim; i++) + distance += ax[i] * bx[i]; - PG_RETURN_FLOAT8(distance); + return distance; } /* * Get the inner product of two 
vectors */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(inner_product); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(inner_product); Datum inner_product(PG_FUNCTION_ARGS) { Vector *a = PG_GETARG_VECTOR_P(0); Vector *b = PG_GETARG_VECTOR_P(1); - float *ax = a->x; - float *bx = b->x; - double distance = 0.0; CheckDims(a, b); - /* Auto-vectorized */ - for (int i = 0; i < a->dim; i++) - distance += ax[i] * bx[i]; - - PG_RETURN_FLOAT8(distance); + PG_RETURN_FLOAT8((double) VectorInnerProduct(a->dim, a->x, b->x)); } /* * Get the negative inner product of two vectors */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_negative_inner_product); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_negative_inner_product); Datum vector_negative_inner_product(PG_FUNCTION_ARGS) { Vector *a = PG_GETARG_VECTOR_P(0); Vector *b = PG_GETARG_VECTOR_P(1); - float *ax = a->x; - float *bx = b->x; - double distance = 0.0; CheckDims(a, b); + PG_RETURN_FLOAT8((double) -VectorInnerProduct(a->dim, a->x, b->x)); +} + +VECTOR_TARGET_CLONES static double +VectorCosineSimilarity(int dim, float *ax, float *bx) +{ + float similarity = 0.0; + float norma = 0.0; + float normb = 0.0; + /* Auto-vectorized */ - for (int i = 0; i < a->dim; i++) - distance += ax[i] * bx[i]; + for (int i = 0; i < dim; i++) + { + similarity += ax[i] * bx[i]; + norma += ax[i] * ax[i]; + normb += bx[i] * bx[i]; + } - PG_RETURN_FLOAT8(distance * -1); + /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ + return (double) similarity / sqrt((double) norma * (double) normb); } /* * Get the cosine distance between two vectors */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(cosine_distance); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(cosine_distance); Datum cosine_distance(PG_FUNCTION_ARGS) { Vector *a = PG_GETARG_VECTOR_P(0); Vector *b = PG_GETARG_VECTOR_P(1); - float *ax = a->x; - float *bx = b->x; - double distance = 0.0; - double norma = 0.0; - double normb = 0.0; + double similarity; CheckDims(a, b); - /* Auto-vectorized */ - for (int i = 0; i < a->dim; i++) - { - distance += ax[i] * 
bx[i]; - norma += ax[i] * ax[i]; - normb += bx[i] * bx[i]; - } + similarity = VectorCosineSimilarity(a->dim, a->x, b->x); - /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ - PG_RETURN_FLOAT8(1 - (distance / sqrt(norma * normb))); +#ifdef _MSC_VER + /* /fp:fast may not propagate NaN */ + if (isnan(similarity)) + PG_RETURN_FLOAT8(NAN); +#endif + + /* Keep in range */ + if (similarity > 1) + similarity = 1.0; + else if (similarity < -1) + similarity = -1.0; + + PG_RETURN_FLOAT8(1.0 - similarity); } /* @@ -643,19 +690,17 @@ cosine_distance(PG_FUNCTION_ARGS) * Currently uses angular distance since needs to satisfy triangle inequality * Assumes inputs are unit vectors (skips norm) */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_spherical_distance); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_spherical_distance); Datum vector_spherical_distance(PG_FUNCTION_ARGS) { Vector *a = PG_GETARG_VECTOR_P(0); Vector *b = PG_GETARG_VECTOR_P(1); - double distance = 0.0; + double distance; CheckDims(a, b); - /* Auto-vectorized */ - for (int i = 0; i < a->dim; i++) - distance += a->x[i] * b->x[i]; + distance = (double) VectorInnerProduct(a->dim, a->x, b->x); /* Prevent NaN with acos with loss of precision */ if (distance > 1) @@ -666,10 +711,38 @@ vector_spherical_distance(PG_FUNCTION_ARGS) PG_RETURN_FLOAT8(acos(distance) / M_PI); } +/* Does not require FMA, but keep logic simple */ +VECTOR_TARGET_CLONES static float +VectorL1Distance(int dim, float *ax, float *bx) +{ + float distance = 0.0; + + /* Auto-vectorized */ + for (int i = 0; i < dim; i++) + distance += fabsf(ax[i] - bx[i]); + + return distance; +} + +/* + * Get the L1 distance between two vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(l1_distance); +Datum +l1_distance(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8((double) VectorL1Distance(a->dim, a->x, b->x)); +} + /* * Get the dimensions of a vector */ -PGDLLEXPORT 
PG_FUNCTION_INFO_V1(vector_dims); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_dims); Datum vector_dims(PG_FUNCTION_ARGS) { @@ -681,7 +754,7 @@ vector_dims(PG_FUNCTION_ARGS) /* * Get the L2 norm of a vector */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_norm); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_norm); Datum vector_norm(PG_FUNCTION_ARGS) { @@ -691,15 +764,54 @@ vector_norm(PG_FUNCTION_ARGS) /* Auto-vectorized */ for (int i = 0; i < a->dim; i++) - norm += ax[i] * ax[i]; + norm += (double) ax[i] * (double) ax[i]; PG_RETURN_FLOAT8(sqrt(norm)); } +/* + * Normalize a vector with the L2 norm + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(l2_normalize); +Datum +l2_normalize(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + float *ax = a->x; + double norm = 0; + Vector *result; + float *rx; + + result = InitVector(a->dim); + rx = result->x; + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + norm += (double) ax[i] * (double) ax[i]; + + norm = sqrt(norm); + + /* Return zero vector for zero norm */ + if (norm > 0) + { + for (int i = 0; i < a->dim; i++) + rx[i] = ax[i] / norm; + + /* Check for overflow */ + for (int i = 0; i < a->dim; i++) + { + if (isinf(rx[i])) + float_overflow_error(); + } + } + + PG_RETURN_POINTER(result); +} + /* * Add vectors */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_add); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_add); Datum vector_add(PG_FUNCTION_ARGS) { @@ -732,7 +844,7 @@ vector_add(PG_FUNCTION_ARGS) /* * Subtract vectors */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_sub); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_sub); Datum vector_sub(PG_FUNCTION_ARGS) { @@ -762,17 +874,141 @@ vector_sub(PG_FUNCTION_ARGS) PG_RETURN_POINTER(result); } +/* + * Multiply vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_mul); +Datum +vector_mul(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); + float *ax = a->x; + float *bx = b->x; + Vector *result; + float *rx; + + CheckDims(a, b); + + result = 
InitVector(a->dim); + rx = result->x; + + /* Auto-vectorized */ + for (int i = 0, imax = a->dim; i < imax; i++) + rx[i] = ax[i] * bx[i]; + + /* Check for overflow and underflow */ + for (int i = 0, imax = a->dim; i < imax; i++) + { + if (isinf(rx[i])) + float_overflow_error(); + + if (rx[i] == 0 && !(ax[i] == 0 || bx[i] == 0)) + float_underflow_error(); + } + + PG_RETURN_POINTER(result); +} + +/* + * Concatenate vectors + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_concat); +Datum +vector_concat(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); + Vector *result; + int dim = a->dim + b->dim; + + CheckDim(dim); + result = InitVector(dim); + + for (int i = 0; i < a->dim; i++) + result->x[i] = a->x[i]; + + for (int i = 0; i < b->dim; i++) + result->x[i + a->dim] = b->x[i]; + + PG_RETURN_POINTER(result); +} + +/* + * Quantize a vector + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(binary_quantize); +Datum +binary_quantize(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + float *ax = a->x; + VarBit *result = InitBitVector(a->dim); + unsigned char *rx = VARBITS(result); + + for (int i = 0; i < a->dim; i++) + rx[i / 8] |= (ax[i] > 0) << (7 - (i % 8)); + + PG_RETURN_VARBIT_P(result); +} + +/* + * Get a subvector + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(subvector); +Datum +subvector(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + int32 start = PG_GETARG_INT32(1); + int32 count = PG_GETARG_INT32(2); + int32 end; + float *ax = a->x; + Vector *result; + int dim; + + if (count < 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("vector must have at least 1 dimension"))); + + /* + * Check if (start + count > a->dim), avoiding integer overflow. a->dim + * and count are both positive, so a->dim - count won't overflow. 
+ */ + if (start > a->dim - count) + end = a->dim + 1; + else + end = start + count; + + /* Indexing starts at 1, like substring */ + if (start < 1) + start = 1; + else if (start > a->dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("vector must have at least 1 dimension"))); + + dim = end - start; + CheckDim(dim); + result = InitVector(dim); + + for (int i = 0; i < dim; i++) + result->x[i] = ax[start - 1 + i]; + + PG_RETURN_POINTER(result); +} + /* * Internal helper to compare vectors */ int vector_cmp_internal(Vector * a, Vector * b) { - int i; + int dim = Min(a->dim, b->dim); - CheckDims(a, b); - - for (i = 0; i < a->dim; i++) + /* Check values before dimensions to be consistent with Postgres arrays */ + for (int i = 0; i < dim; i++) { if (a->x[i] < b->x[i]) return -1; @@ -780,18 +1016,25 @@ vector_cmp_internal(Vector * a, Vector * b) if (a->x[i] > b->x[i]) return 1; } + + if (a->dim < b->dim) + return -1; + + if (a->dim > b->dim) + return 1; + return 0; } /* * Less than */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_lt); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_lt); Datum vector_lt(PG_FUNCTION_ARGS) { - Vector *a = (Vector *) PG_GETARG_VECTOR_P(0); - Vector *b = (Vector *) PG_GETARG_VECTOR_P(1); + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); PG_RETURN_BOOL(vector_cmp_internal(a, b) < 0); } @@ -799,12 +1042,12 @@ vector_lt(PG_FUNCTION_ARGS) /* * Less than or equal */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_le); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_le); Datum vector_le(PG_FUNCTION_ARGS) { - Vector *a = (Vector *) PG_GETARG_VECTOR_P(0); - Vector *b = (Vector *) PG_GETARG_VECTOR_P(1); + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); PG_RETURN_BOOL(vector_cmp_internal(a, b) <= 0); } @@ -812,12 +1055,12 @@ vector_le(PG_FUNCTION_ARGS) /* * Equal */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_eq); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_eq); Datum vector_eq(PG_FUNCTION_ARGS) { - Vector *a = 
(Vector *) PG_GETARG_VECTOR_P(0); - Vector *b = (Vector *) PG_GETARG_VECTOR_P(1); + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); PG_RETURN_BOOL(vector_cmp_internal(a, b) == 0); } @@ -825,12 +1068,12 @@ vector_eq(PG_FUNCTION_ARGS) /* * Not equal */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_ne); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_ne); Datum vector_ne(PG_FUNCTION_ARGS) { - Vector *a = (Vector *) PG_GETARG_VECTOR_P(0); - Vector *b = (Vector *) PG_GETARG_VECTOR_P(1); + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); PG_RETURN_BOOL(vector_cmp_internal(a, b) != 0); } @@ -838,12 +1081,12 @@ vector_ne(PG_FUNCTION_ARGS) /* * Greater than or equal */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_ge); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_ge); Datum vector_ge(PG_FUNCTION_ARGS) { - Vector *a = (Vector *) PG_GETARG_VECTOR_P(0); - Vector *b = (Vector *) PG_GETARG_VECTOR_P(1); + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); PG_RETURN_BOOL(vector_cmp_internal(a, b) >= 0); } @@ -851,12 +1094,12 @@ vector_ge(PG_FUNCTION_ARGS) /* * Greater than */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_gt); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_gt); Datum vector_gt(PG_FUNCTION_ARGS) { - Vector *a = (Vector *) PG_GETARG_VECTOR_P(0); - Vector *b = (Vector *) PG_GETARG_VECTOR_P(1); + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); PG_RETURN_BOOL(vector_cmp_internal(a, b) > 0); } @@ -864,12 +1107,12 @@ vector_gt(PG_FUNCTION_ARGS) /* * Compare vectors */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_cmp); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_cmp); Datum vector_cmp(PG_FUNCTION_ARGS) { - Vector *a = (Vector *) PG_GETARG_VECTOR_P(0); - Vector *b = (Vector *) PG_GETARG_VECTOR_P(1); + Vector *a = PG_GETARG_VECTOR_P(0); + Vector *b = PG_GETARG_VECTOR_P(1); PG_RETURN_INT32(vector_cmp_internal(a, b)); } @@ -877,7 +1120,7 @@ vector_cmp(PG_FUNCTION_ARGS) /* * Accumulate vectors */ -PGDLLEXPORT 
PG_FUNCTION_INFO_V1(vector_accum); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_accum); Datum vector_accum(PG_FUNCTION_ARGS) { @@ -936,12 +1179,13 @@ vector_accum(PG_FUNCTION_ARGS) } /* - * Combine vectors + * Combine vectors or half vectors (also used for halfvec_combine) */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_combine); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_combine); Datum vector_combine(PG_FUNCTION_ARGS) { + /* Must also update parameters of halfvec_combine if modifying */ ArrayType *statearray1 = PG_GETARG_ARRAYTYPE_P(0); ArrayType *statearray2 = PG_GETARG_ARRAYTYPE_P(1); float8 *statevalues1; @@ -1008,7 +1252,7 @@ vector_combine(PG_FUNCTION_ARGS) /* * Average vectors */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_avg); +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(vector_avg); Datum vector_avg(PG_FUNCTION_ARGS) { @@ -1038,3 +1282,26 @@ vector_avg(PG_FUNCTION_ARGS) PG_RETURN_POINTER(result); } + +/* + * Convert sparse vector to dense vector + */ +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(sparsevec_to_vector); +Datum +sparsevec_to_vector(PG_FUNCTION_ARGS) +{ + SparseVector *svec = PG_GETARG_SPARSEVEC_P(0); + int32 typmod = PG_GETARG_INT32(1); + Vector *result; + int dim = svec->dim; + float *values = SPARSEVEC_VALUES(svec); + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + result = InitVector(dim); + for (int i = 0; i < svec->nnz; i++) + result->x[svec->indices[i]] = values[i]; + + PG_RETURN_POINTER(result); +} diff --git a/src/postgres/third-party-extensions/pgvector/src/vector.h b/src/postgres/third-party-extensions/pgvector/src/vector.h index 93aeb6a6be19..e29ccdc6a924 100644 --- a/src/postgres/third-party-extensions/pgvector/src/vector.h +++ b/src/postgres/third-party-extensions/pgvector/src/vector.h @@ -1,12 +1,6 @@ #ifndef VECTOR_H #define VECTOR_H -#include "postgres.h" - -#if PG_VERSION_NUM >= 160000 -#include "varatt.h" -#endif - #define VECTOR_MAX_DIM 16000 #define VECTOR_SIZE(_dim) (offsetof(Vector, x) + sizeof(float)*(_dim)) @@ -18,28 +12,19 @@ typedef 
struct Vector { int32 vl_len_; /* varlena header (do not touch directly!) */ int16 dim; /* number of dimensions */ - int16 unused; + int16 unused; /* reserved for future use, always zero */ float x[FLEXIBLE_ARRAY_MEMBER]; } Vector; +Vector *InitVector(int dim); void PrintVector(char *msg, Vector * vector); int vector_cmp_internal(Vector * a, Vector * b); -/* - * Allocate and initialize a new vector - */ -static inline Vector * -InitVector(int dim) -{ - Vector *result; - int size; - - size = VECTOR_SIZE(dim); - result = (Vector *) palloc0(size); - SET_VARSIZE(result, size); - result->dim = dim; - - return result; -} +/* TODO Move to better place */ +#if PG_VERSION_NUM >= 160000 +#define FUNCTION_PREFIX +#else +#define FUNCTION_PREFIX PGDLLEXPORT +#endif #endif diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/bit.out b/src/postgres/third-party-extensions/pgvector/test/expected/bit.out new file mode 100644 index 000000000000..a9ba47255790 --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/test/expected/bit.out @@ -0,0 +1,140 @@ +SELECT hamming_distance('111', '111'); + hamming_distance +------------------ + 0 +(1 row) + +SELECT hamming_distance('111', '110'); + hamming_distance +------------------ + 1 +(1 row) + +SELECT hamming_distance('111', '100'); + hamming_distance +------------------ + 2 +(1 row) + +SELECT hamming_distance('111', '000'); + hamming_distance +------------------ + 3 +(1 row) + +SELECT hamming_distance('10101010101010101010', '01010101010101010101'); + hamming_distance +------------------ + 20 +(1 row) + +SELECT 
hamming_distance('101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101', '101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101'); + hamming_distance +------------------ + 0 +(1 row) + +SELECT hamming_distance('101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101', 
'010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010'); + hamming_distance +------------------ + 513 +(1 row) + +SELECT hamming_distance('110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011', '100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001'); + hamming_distance +------------------ + 2 +(1 row) + +SELECT hamming_distance('', ''); + hamming_distance +------------------ + 0 +(1 row) + +SELECT hamming_distance('111', '00'); +ERROR: different bit lengths 3 and 2 +SELECT hamming_distance('111', '000'::varbit(4)); + hamming_distance +------------------ + 3 +(1 row) + +SELECT hamming_distance('111', 
'0000'::varbit(4)); +ERROR: different bit lengths 3 and 4 +SELECT jaccard_distance('1111', '1111'); + jaccard_distance +------------------ + 0 +(1 row) + +SELECT jaccard_distance('1111', '1110'); + jaccard_distance +------------------ + 0.25 +(1 row) + +SELECT jaccard_distance('1111', '1100'); + jaccard_distance +------------------ + 0.5 +(1 row) + +SELECT jaccard_distance('1111', '1000'); + jaccard_distance +------------------ + 0.75 +(1 row) + +SELECT jaccard_distance('1111', '0000'); + jaccard_distance +------------------ + 1 +(1 row) + +SELECT jaccard_distance('1100', '1000'); + jaccard_distance +------------------ + 0.5 +(1 row) + +SELECT jaccard_distance('10101010101010101010', '01010101010101010101'); + jaccard_distance +------------------ + 1 +(1 row) + +SELECT jaccard_distance('101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101', '101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101'); + jaccard_distance +------------------ + 0 +(1 row) + +SELECT 
jaccard_distance('101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101', '010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010'); + jaccard_distance +------------------ + 1 +(1 row) + +SELECT jaccard_distance('110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011', 
'100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001'); + jaccard_distance +------------------ + 0.5 +(1 row) + +SELECT jaccard_distance('', ''); + jaccard_distance +------------------ + 1 +(1 row) + +SELECT jaccard_distance('1111', '000'); +ERROR: different bit lengths 4 and 3 +SELECT jaccard_distance('1111', '0000'::varbit(5)); + jaccard_distance +------------------ + 1 +(1 row) + +SELECT jaccard_distance('1111', '00000'::varbit(5)); +ERROR: different bit lengths 4 and 5 diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/btree.out b/src/postgres/third-party-extensions/pgvector/test/expected/btree.out index d8b6da5690e9..999a1608f1ac 100644 --- a/src/postgres/third-party-extensions/pgvector/test/expected/btree.out +++ b/src/postgres/third-party-extensions/pgvector/test/expected/btree.out @@ -1,4 +1,5 @@ SET enable_seqscan = off; +-- vector CREATE TABLE t (val vector(3)); INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); CREATE INDEX ON t (val); @@ -8,10 +9,53 @@ SELECT * FROM t WHERE val = '[1,2,3]'; [1,2,3] (1 row) -SELECT * FROM t ORDER BY val LIMIT 1; +SELECT * FROM t ORDER BY val; val --------- [0,0,0] + [1,1,1] + [1,2,3] + +(4 rows) + +DROP TABLE t; +-- halfvec +CREATE TABLE t (val halfvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t (val); +SELECT * FROM t WHERE val = '[1,2,3]'; + val +--------- + [1,2,3] (1 row) +SELECT * FROM t ORDER BY val; + val +--------- + [0,0,0] + [1,1,1] + [1,2,3] + +(4 rows) + 
+DROP TABLE t; +-- sparsevec +CREATE TABLE t (val sparsevec(3)); +INSERT INTO t (val) VALUES ('{}/3'), ('{1:1,2:2,3:3}/3'), ('{1:1,2:1,3:1}/3'), (NULL); +CREATE INDEX ON t (val); +SELECT * FROM t WHERE val = '{1:1,2:2,3:3}/3'; + val +----------------- + {1:1,2:2,3:3}/3 +(1 row) + +SELECT * FROM t ORDER BY val; + val +----------------- + {}/3 + {1:1,2:1,3:1}/3 + {1:1,2:2,3:3}/3 + +(4 rows) + DROP TABLE t; diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/cast.out b/src/postgres/third-party-extensions/pgvector/test/expected/cast.out index 37614d9338aa..c180fe621639 100644 --- a/src/postgres/third-party-extensions/pgvector/test/expected/cast.out +++ b/src/postgres/third-party-extensions/pgvector/test/expected/cast.out @@ -28,8 +28,28 @@ SELECT ARRAY[1,2,3]::numeric[]::vector; [1,2,3] (1 row) +SELECT '[1,2,3]'::vector::real[]; + float4 +--------- + {1,2,3} +(1 row) + +SELECT '{1,2,3}'::real[]::vector; + vector +--------- + [1,2,3] +(1 row) + +SELECT '{1,2,3}'::real[]::vector(3); + vector +--------- + [1,2,3] +(1 row) + +SELECT '{1,2,3}'::real[]::vector(2); +ERROR: expected 2 dimensions, not 3 SELECT '{NULL}'::real[]::vector; -ERROR: array must not containing NULLs +ERROR: array must not contain nulls SELECT '{NaN}'::real[]::vector; ERROR: NaN not allowed in vector SELECT '{Infinity}'::real[]::vector; @@ -38,12 +58,212 @@ SELECT '{-Infinity}'::real[]::vector; ERROR: infinite value not allowed in vector SELECT '{}'::real[]::vector; ERROR: vector must have at least 1 dimension -SELECT '[1,2,3]'::vector::real[]; - float4 +SELECT '{{1}}'::real[]::vector; +ERROR: array must be 1-D +SELECT '{1,2,3}'::double precision[]::vector; + vector --------- - {1,2,3} + [1,2,3] +(1 row) + +SELECT '{1,2,3}'::double precision[]::vector(3); + vector +--------- + [1,2,3] +(1 row) + +SELECT '{1,2,3}'::double precision[]::vector(2); +ERROR: expected 2 dimensions, not 3 +SELECT '{4e38,-4e38}'::double precision[]::vector; +ERROR: infinite value not allowed in vector +SELECT 
'{1e-46,-1e-46}'::double precision[]::vector; + vector +-------- + [0,-0] +(1 row) + +SELECT '[1,2,3]'::vector::halfvec; + halfvec +--------- + [1,2,3] +(1 row) + +SELECT '[1,2,3]'::vector::halfvec(3); + halfvec +--------- + [1,2,3] +(1 row) + +SELECT '[1,2,3]'::vector::halfvec(2); +ERROR: expected 2 dimensions, not 3 +SELECT '[65520]'::vector::halfvec; +ERROR: "65520" is out of range for type halfvec +SELECT '[1e-8]'::vector::halfvec; + halfvec +--------- + [0] +(1 row) + +SELECT '[1,2,3]'::halfvec::vector; + vector +--------- + [1,2,3] +(1 row) + +SELECT '[1,2,3]'::halfvec::vector(3); + vector +--------- + [1,2,3] +(1 row) + +SELECT '[1,2,3]'::halfvec::vector(2); +ERROR: expected 2 dimensions, not 3 +SELECT '{1,2,3}'::real[]::halfvec; + halfvec +--------- + [1,2,3] +(1 row) + +SELECT '{1,2,3}'::real[]::halfvec(3); + halfvec +--------- + [1,2,3] +(1 row) + +SELECT '{1,2,3}'::real[]::halfvec(2); +ERROR: expected 2 dimensions, not 3 +SELECT '{65520,-65520}'::real[]::halfvec; +ERROR: "65520" is out of range for type halfvec +SELECT '{1e-8,-1e-8}'::real[]::halfvec; + halfvec +--------- + [0,-0] +(1 row) + +SELECT '[0,1.5,0,3.5,0]'::vector::sparsevec; + sparsevec +----------------- + {2:1.5,4:3.5}/5 +(1 row) + +SELECT '[0,1.5,0,3.5,0]'::vector::sparsevec(5); + sparsevec +----------------- + {2:1.5,4:3.5}/5 +(1 row) + +SELECT '[0,1.5,0,3.5,0]'::vector::sparsevec(4); +ERROR: expected 4 dimensions, not 5 +SELECT '{2:1.5,4:3.5}/5'::sparsevec::vector; + vector +----------------- + [0,1.5,0,3.5,0] +(1 row) + +SELECT '{2:1.5,4:3.5}/5'::sparsevec::vector(5); + vector +----------------- + [0,1.5,0,3.5,0] +(1 row) + +SELECT '{2:1.5,4:3.5}/5'::sparsevec::vector(4); +ERROR: expected 4 dimensions, not 5 +SELECT '{}/16001'::sparsevec::vector; +ERROR: vector cannot have more than 16000 dimensions +SELECT '[0,1.5,0,3.5,0]'::halfvec::sparsevec; + sparsevec +----------------- + {2:1.5,4:3.5}/5 +(1 row) + +SELECT '[0,1.5,0,3.5,0]'::halfvec::sparsevec(5); + sparsevec +----------------- + 
{2:1.5,4:3.5}/5 +(1 row) + +SELECT '[0,1.5,0,3.5,0]'::halfvec::sparsevec(4); +ERROR: expected 4 dimensions, not 5 +SELECT '{2:1.5,4:3.5}/5'::sparsevec::halfvec; + halfvec +----------------- + [0,1.5,0,3.5,0] +(1 row) + +SELECT '{2:1.5,4:3.5}/5'::sparsevec::halfvec(5); + halfvec +----------------- + [0,1.5,0,3.5,0] +(1 row) + +SELECT '{2:1.5,4:3.5}/5'::sparsevec::halfvec(4); +ERROR: expected 4 dimensions, not 5 +SELECT '{}/16001'::sparsevec::halfvec; +ERROR: halfvec cannot have more than 16000 dimensions +SELECT '{1:65520}/1'::sparsevec::halfvec; +ERROR: "65520" is out of range for type halfvec +SELECT '{1:1e-8}/1'::sparsevec::halfvec; + halfvec +--------- + [0] +(1 row) + +SELECT ARRAY[1,0,2,0,3,0]::sparsevec; + array +----------------- + {1:1,3:2,5:3}/6 +(1 row) + +SELECT ARRAY[1.0,0.0,2.0,0.0,3.0,0.0]::sparsevec; + array +----------------- + {1:1,3:2,5:3}/6 +(1 row) + +SELECT ARRAY[1,0,2,0,3,0]::float4[]::sparsevec; + array +----------------- + {1:1,3:2,5:3}/6 +(1 row) + +SELECT ARRAY[1,0,2,0,3,0]::float8[]::sparsevec; + array +----------------- + {1:1,3:2,5:3}/6 +(1 row) + +SELECT ARRAY[1,0,2,0,3,0]::numeric[]::sparsevec; + array +----------------- + {1:1,3:2,5:3}/6 +(1 row) + +SELECT '{1,0,2,0,3,0}'::real[]::sparsevec; + sparsevec +----------------- + {1:1,3:2,5:3}/6 +(1 row) + +SELECT '{1,0,2,0,3,0}'::real[]::sparsevec(6); + sparsevec +----------------- + {1:1,3:2,5:3}/6 (1 row) +SELECT '{1,0,2,0,3,0}'::real[]::sparsevec(5); +ERROR: expected 5 dimensions, not 6 +SELECT '{NULL}'::real[]::sparsevec; +ERROR: array must not contain nulls +SELECT '{NaN}'::real[]::sparsevec; +ERROR: NaN not allowed in sparsevec +SELECT '{Infinity}'::real[]::sparsevec; +ERROR: infinite value not allowed in sparsevec +SELECT '{-Infinity}'::real[]::sparsevec; +ERROR: infinite value not allowed in sparsevec +SELECT '{}'::real[]::sparsevec; +ERROR: sparsevec must have at least 1 dimension +SELECT '{{1}}'::real[]::sparsevec; +ERROR: array must be 1-D SELECT array_agg(n)::vector FROM 
generate_series(1, 16001) n; ERROR: vector cannot have more than 16000 dimensions SELECT array_to_vector(array_agg(n), 16001, false) FROM generate_series(1, 16001) n; diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/copy.out b/src/postgres/third-party-extensions/pgvector/test/expected/copy.out index 36d4620db31e..9b4ebc088b79 100644 --- a/src/postgres/third-party-extensions/pgvector/test/expected/copy.out +++ b/src/postgres/third-party-extensions/pgvector/test/expected/copy.out @@ -1,8 +1,9 @@ +-- vector CREATE TABLE t (val vector(3)); INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); CREATE TABLE t2 (val vector(3)); -\copy t TO 'results/data.bin' WITH (FORMAT binary) -\copy t2 FROM 'results/data.bin' WITH (FORMAT binary) +\copy t TO 'results/vector.bin' WITH (FORMAT binary) +\copy t2 FROM 'results/vector.bin' WITH (FORMAT binary) SELECT * FROM t2 ORDER BY val; val --------- @@ -14,3 +15,37 @@ SELECT * FROM t2 ORDER BY val; DROP TABLE t; DROP TABLE t2; +-- halfvec +CREATE TABLE t (val halfvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE TABLE t2 (val halfvec(3)); +\copy t TO 'results/halfvec.bin' WITH (FORMAT binary) +\copy t2 FROM 'results/halfvec.bin' WITH (FORMAT binary) +SELECT * FROM t2 ORDER BY val; + val +--------- + [0,0,0] + [1,1,1] + [1,2,3] + +(4 rows) + +DROP TABLE t; +DROP TABLE t2; +-- sparsevec +CREATE TABLE t (val sparsevec(3)); +INSERT INTO t (val) VALUES ('{}/3'), ('{1:1,2:2,3:3}/3'), ('{1:1,2:1,3:1}/3'), (NULL); +CREATE TABLE t2 (val sparsevec(3)); +\copy t TO 'results/sparsevec.bin' WITH (FORMAT binary) +\copy t2 FROM 'results/sparsevec.bin' WITH (FORMAT binary) +SELECT * FROM t2 ORDER BY val; + val +----------------- + {}/3 + {1:1,2:1,3:1}/3 + {1:1,2:2,3:3}/3 + +(4 rows) + +DROP TABLE t; +DROP TABLE t2; diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/functions.out 
b/src/postgres/third-party-extensions/pgvector/test/expected/functions.out deleted file mode 100644 index 0272282f2a45..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/expected/functions.out +++ /dev/null @@ -1,110 +0,0 @@ -SELECT '[1,2,3]'::vector + '[4,5,6]'; - ?column? ----------- - [5,7,9] -(1 row) - -SELECT '[3e38]'::vector + '[3e38]'; -ERROR: value out of range: overflow -SELECT '[1,2,3]'::vector - '[4,5,6]'; - ?column? ------------- - [-3,-3,-3] -(1 row) - -SELECT '[-3e38]'::vector - '[3e38]'; -ERROR: value out of range: overflow -SELECT vector_dims('[1,2,3]'); - vector_dims -------------- - 3 -(1 row) - -SELECT round(vector_norm('[1,1]')::numeric, 5); - round ---------- - 1.41421 -(1 row) - -SELECT vector_norm('[3,4]'); - vector_norm -------------- - 5 -(1 row) - -SELECT vector_norm('[0,1]'); - vector_norm -------------- - 1 -(1 row) - -SELECT l2_distance('[0,0]', '[3,4]'); - l2_distance -------------- - 5 -(1 row) - -SELECT l2_distance('[0,0]', '[0,1]'); - l2_distance -------------- - 1 -(1 row) - -SELECT l2_distance('[1,2]', '[3]'); -ERROR: different vector dimensions 2 and 1 -SELECT inner_product('[1,2]', '[3,4]'); - inner_product ---------------- - 11 -(1 row) - -SELECT inner_product('[1,2]', '[3]'); -ERROR: different vector dimensions 2 and 1 -SELECT cosine_distance('[1,2]', '[2,4]'); - cosine_distance ------------------ - 0 -(1 row) - -SELECT cosine_distance('[1,2]', '[0,0]'); - cosine_distance ------------------ - NaN -(1 row) - -SELECT cosine_distance('[1,1]', '[1,1]'); - cosine_distance ------------------ - 0 -(1 row) - -SELECT cosine_distance('[1,1]', '[-1,-1]'); - cosine_distance ------------------ - 2 -(1 row) - -SELECT cosine_distance('[1,2]', '[3]'); -ERROR: different vector dimensions 2 and 1 -SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]']) v; - avg ------------ - [2,3.5,5] -(1 row) - -SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]', NULL]) v; - avg ------------ - [2,3.5,5] -(1 row) - -SELECT 
avg(v) FROM unnest(ARRAY[]::vector[]) v; - avg ------ - -(1 row) - -SELECT avg(v) FROM unnest(ARRAY['[1,2]'::vector, '[3]']) v; -ERROR: expected 2 dimensions, not 1 -SELECT vector_avg(array_agg(n)) FROM generate_series(1, 16002) n; -ERROR: vector cannot have more than 16000 dimensions diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/halfvec.out b/src/postgres/third-party-extensions/pgvector/test/expected/halfvec.out new file mode 100644 index 000000000000..a3ce8931f4f1 --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/test/expected/halfvec.out @@ -0,0 +1,636 @@ +SELECT '[1,2,3]'::halfvec; + halfvec +--------- + [1,2,3] +(1 row) + +SELECT '[-1,-2,-3]'::halfvec; + halfvec +------------ + [-1,-2,-3] +(1 row) + +SELECT '[1.,2.,3.]'::halfvec; + halfvec +--------- + [1,2,3] +(1 row) + +SELECT ' [ 1, 2 , 3 ] '::halfvec; + halfvec +--------- + [1,2,3] +(1 row) + +SELECT '[1.23456]'::halfvec; + halfvec +------------ + [1.234375] +(1 row) + +SELECT '[hello,1]'::halfvec; +ERROR: invalid input syntax for type halfvec: "[hello,1]" +LINE 1: SELECT '[hello,1]'::halfvec; + ^ +SELECT '[NaN,1]'::halfvec; +ERROR: NaN not allowed in halfvec +LINE 1: SELECT '[NaN,1]'::halfvec; + ^ +SELECT '[Infinity,1]'::halfvec; +ERROR: infinite value not allowed in halfvec +LINE 1: SELECT '[Infinity,1]'::halfvec; + ^ +SELECT '[-Infinity,1]'::halfvec; +ERROR: infinite value not allowed in halfvec +LINE 1: SELECT '[-Infinity,1]'::halfvec; + ^ +SELECT '[65519,-65519]'::halfvec; + halfvec +---------------- + [65504,-65504] +(1 row) + +SELECT '[65520,-65520]'::halfvec; +ERROR: "65520" is out of range for type halfvec +LINE 1: SELECT '[65520,-65520]'::halfvec; + ^ +SELECT '[1e-8,-1e-8]'::halfvec; + halfvec +--------- + [0,-0] +(1 row) + +SELECT '[4e38,1]'::halfvec; +ERROR: "4e38" is out of range for type halfvec +LINE 1: SELECT '[4e38,1]'::halfvec; + ^ +SELECT '[1e-46,1]'::halfvec; + halfvec +--------- + [0,1] +(1 row) + +SELECT '[1,2,3'::halfvec; +ERROR: invalid input 
syntax for type halfvec: "[1,2,3" +LINE 1: SELECT '[1,2,3'::halfvec; + ^ +SELECT '[1,2,3]9'::halfvec; +ERROR: invalid input syntax for type halfvec: "[1,2,3]9" +LINE 1: SELECT '[1,2,3]9'::halfvec; + ^ +DETAIL: Junk after closing right brace. +SELECT '1,2,3'::halfvec; +ERROR: invalid input syntax for type halfvec: "1,2,3" +LINE 1: SELECT '1,2,3'::halfvec; + ^ +DETAIL: Vector contents must start with "[". +SELECT ''::halfvec; +ERROR: invalid input syntax for type halfvec: "" +LINE 1: SELECT ''::halfvec; + ^ +DETAIL: Vector contents must start with "[". +SELECT '['::halfvec; +ERROR: invalid input syntax for type halfvec: "[" +LINE 1: SELECT '['::halfvec; + ^ +SELECT '[ '::halfvec; +ERROR: invalid input syntax for type halfvec: "[ " +LINE 1: SELECT '[ '::halfvec; + ^ +SELECT '[,'::halfvec; +ERROR: invalid input syntax for type halfvec: "[," +LINE 1: SELECT '[,'::halfvec; + ^ +SELECT '[]'::halfvec; +ERROR: halfvec must have at least 1 dimension +LINE 1: SELECT '[]'::halfvec; + ^ +SELECT '[ ]'::halfvec; +ERROR: halfvec must have at least 1 dimension +LINE 1: SELECT '[ ]'::halfvec; + ^ +SELECT '[,]'::halfvec; +ERROR: invalid input syntax for type halfvec: "[,]" +LINE 1: SELECT '[,]'::halfvec; + ^ +SELECT '[1,]'::halfvec; +ERROR: invalid input syntax for type halfvec: "[1,]" +LINE 1: SELECT '[1,]'::halfvec; + ^ +SELECT '[1a]'::halfvec; +ERROR: invalid input syntax for type halfvec: "[1a]" +LINE 1: SELECT '[1a]'::halfvec; + ^ +SELECT '[1,,3]'::halfvec; +ERROR: invalid input syntax for type halfvec: "[1,,3]" +LINE 1: SELECT '[1,,3]'::halfvec; + ^ +SELECT '[1, ,3]'::halfvec; +ERROR: invalid input syntax for type halfvec: "[1, ,3]" +LINE 1: SELECT '[1, ,3]'::halfvec; + ^ +SELECT '[1,2,3]'::halfvec(3); + halfvec +--------- + [1,2,3] +(1 row) + +SELECT '[1,2,3]'::halfvec(2); +ERROR: expected 2 dimensions, not 3 +SELECT '[1,2,3]'::halfvec(3, 2); +ERROR: invalid type modifier +LINE 1: SELECT '[1,2,3]'::halfvec(3, 2); + ^ +SELECT '[1,2,3]'::halfvec('a'); +ERROR: invalid input 
syntax for type integer: "a" +LINE 1: SELECT '[1,2,3]'::halfvec('a'); + ^ +SELECT '[1,2,3]'::halfvec(0); +ERROR: dimensions for type halfvec must be at least 1 +LINE 1: SELECT '[1,2,3]'::halfvec(0); + ^ +SELECT '[1,2,3]'::halfvec(16001); +ERROR: dimensions for type halfvec cannot exceed 16000 +LINE 1: SELECT '[1,2,3]'::halfvec(16001); + ^ +SELECT unnest('{"[1,2,3]", "[4,5,6]"}'::halfvec[]); + unnest +--------- + [1,2,3] + [4,5,6] +(2 rows) + +SELECT '{"[1,2,3]"}'::halfvec(2)[]; +ERROR: expected 2 dimensions, not 3 +SELECT '[1,2,3]'::halfvec + '[4,5,6]'; + ?column? +---------- + [5,7,9] +(1 row) + +SELECT '[65519]'::halfvec + '[65519]'; +ERROR: value out of range: overflow +SELECT '[1,2]'::halfvec + '[3]'; +ERROR: different halfvec dimensions 2 and 1 +SELECT '[1,2,3]'::halfvec - '[4,5,6]'; + ?column? +------------ + [-3,-3,-3] +(1 row) + +SELECT '[-65519]'::halfvec - '[65519]'; +ERROR: value out of range: overflow +SELECT '[1,2]'::halfvec - '[3]'; +ERROR: different halfvec dimensions 2 and 1 +SELECT '[1,2,3]'::halfvec * '[4,5,6]'; + ?column? +----------- + [4,10,18] +(1 row) + +SELECT '[65519]'::halfvec * '[65519]'; +ERROR: value out of range: overflow +SELECT '[1e-7]'::halfvec * '[1e-7]'; +ERROR: value out of range: underflow +SELECT '[1,2]'::halfvec * '[3]'; +ERROR: different halfvec dimensions 2 and 1 +SELECT '[1,2,3]'::halfvec || '[4,5]'; + ?column? +------------- + [1,2,3,4,5] +(1 row) + +SELECT array_fill(0, ARRAY[16000])::halfvec || '[1]'; +ERROR: halfvec cannot have more than 16000 dimensions +SELECT '[1,2,3]'::halfvec < '[1,2,3]'; + ?column? +---------- + f +(1 row) + +SELECT '[1,2,3]'::halfvec < '[1,2]'; + ?column? +---------- + f +(1 row) + +SELECT '[1,2,3]'::halfvec <= '[1,2,3]'; + ?column? +---------- + t +(1 row) + +SELECT '[1,2,3]'::halfvec <= '[1,2]'; + ?column? +---------- + f +(1 row) + +SELECT '[1,2,3]'::halfvec = '[1,2,3]'; + ?column? +---------- + t +(1 row) + +SELECT '[1,2,3]'::halfvec = '[1,2]'; + ?column? 
+---------- + f +(1 row) + +SELECT '[1,2,3]'::halfvec != '[1,2,3]'; + ?column? +---------- + f +(1 row) + +SELECT '[1,2,3]'::halfvec != '[1,2]'; + ?column? +---------- + t +(1 row) + +SELECT '[1,2,3]'::halfvec >= '[1,2,3]'; + ?column? +---------- + t +(1 row) + +SELECT '[1,2,3]'::halfvec >= '[1,2]'; + ?column? +---------- + t +(1 row) + +SELECT '[1,2,3]'::halfvec > '[1,2,3]'; + ?column? +---------- + f +(1 row) + +SELECT '[1,2,3]'::halfvec > '[1,2]'; + ?column? +---------- + t +(1 row) + +SELECT halfvec_cmp('[1,2,3]', '[1,2,3]'); + halfvec_cmp +------------- + 0 +(1 row) + +SELECT halfvec_cmp('[1,2,3]', '[0,0,0]'); + halfvec_cmp +------------- + 1 +(1 row) + +SELECT halfvec_cmp('[0,0,0]', '[1,2,3]'); + halfvec_cmp +------------- + -1 +(1 row) + +SELECT halfvec_cmp('[1,2]', '[1,2,3]'); + halfvec_cmp +------------- + -1 +(1 row) + +SELECT halfvec_cmp('[1,2,3]', '[1,2]'); + halfvec_cmp +------------- + 1 +(1 row) + +SELECT halfvec_cmp('[1,2]', '[2,3,4]'); + halfvec_cmp +------------- + -1 +(1 row) + +SELECT halfvec_cmp('[2,3]', '[1,2,3]'); + halfvec_cmp +------------- + 1 +(1 row) + +SELECT vector_dims('[1,2,3]'::halfvec); + vector_dims +------------- + 3 +(1 row) + +SELECT round(l2_norm('[1,1]'::halfvec)::numeric, 5); + round +--------- + 1.41421 +(1 row) + +SELECT l2_norm('[3,4]'::halfvec); + l2_norm +--------- + 5 +(1 row) + +SELECT l2_norm('[0,1]'::halfvec); + l2_norm +--------- + 1 +(1 row) + +SELECT l2_norm('[0,0]'::halfvec); + l2_norm +--------- + 0 +(1 row) + +SELECT l2_norm('[2]'::halfvec); + l2_norm +--------- + 2 +(1 row) + +SELECT l2_distance('[0,0]'::halfvec, '[3,4]'); + l2_distance +------------- + 5 +(1 row) + +SELECT l2_distance('[0,0]'::halfvec, '[0,1]'); + l2_distance +------------- + 1 +(1 row) + +SELECT l2_distance('[1,2]'::halfvec, '[3]'); +ERROR: different halfvec dimensions 2 and 1 +SELECT l2_distance('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,4,5]'); + l2_distance +------------- + 5 +(1 row) + +SELECT '[0,0]'::halfvec <-> '[3,4]'; + 
?column? +---------- + 5 +(1 row) + +SELECT inner_product('[1,2]'::halfvec, '[3,4]'); + inner_product +--------------- + 11 +(1 row) + +SELECT inner_product('[1,2]'::halfvec, '[3]'); +ERROR: different halfvec dimensions 2 and 1 +SELECT inner_product('[65504]'::halfvec, '[65504]'); + inner_product +--------------- + 4290774016 +(1 row) + +SELECT inner_product('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); + inner_product +--------------- + 45 +(1 row) + +SELECT '[1,2]'::halfvec <#> '[3,4]'; + ?column? +---------- + -11 +(1 row) + +SELECT cosine_distance('[1,2]'::halfvec, '[2,4]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,2]'::halfvec, '[0,0]'); + cosine_distance +----------------- + NaN +(1 row) + +SELECT cosine_distance('[1,1]'::halfvec, '[1,1]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,0]'::halfvec, '[0,2]'); + cosine_distance +----------------- + 1 +(1 row) + +SELECT cosine_distance('[1,1]'::halfvec, '[-1,-1]'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT cosine_distance('[1,2]'::halfvec, '[3]'); +ERROR: different halfvec dimensions 2 and 1 +SELECT cosine_distance('[1,1]'::halfvec, '[1.1,1.1]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,1]'::halfvec, '[-1.1,-1.1]'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[-1,-2,-3,-4,-5,-6,-7,-8,-9]'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT '[1,2]'::halfvec <=> '[2,4]'; + ?column? 
+---------- + 0 +(1 row) + +SELECT l1_distance('[0,0]'::halfvec, '[3,4]'); + l1_distance +------------- + 7 +(1 row) + +SELECT l1_distance('[0,0]'::halfvec, '[0,1]'); + l1_distance +------------- + 1 +(1 row) + +SELECT l1_distance('[1,2]'::halfvec, '[3]'); +ERROR: different halfvec dimensions 2 and 1 +SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); + l1_distance +------------- + 0 +(1 row) + +SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[0,3,2,5,4,7,6,9,8]'); + l1_distance +------------- + 9 +(1 row) + +SELECT '[0,0]'::halfvec <+> '[3,4]'; + ?column? +---------- + 7 +(1 row) + +SELECT l2_normalize('[3,4]'::halfvec); + l2_normalize +------------------------ + [0.60009766,0.7998047] +(1 row) + +SELECT l2_normalize('[3,0]'::halfvec); + l2_normalize +-------------- + [1,0] +(1 row) + +SELECT l2_normalize('[0,0.1]'::halfvec); + l2_normalize +-------------- + [0,1] +(1 row) + +SELECT l2_normalize('[0,0]'::halfvec); + l2_normalize +-------------- + [0,0] +(1 row) + +SELECT l2_normalize('[65504]'::halfvec); + l2_normalize +-------------- + [1] +(1 row) + +SELECT binary_quantize('[1,0,-1]'::halfvec); + binary_quantize +----------------- + 100 +(1 row) + +SELECT binary_quantize('[0,0.1,-0.2,-0.3,0.4,0.5,0.6,-0.7,0.8,-0.9,1]'::halfvec); + binary_quantize +----------------- + 01001110101 +(1 row) + +SELECT subvector('[1,2,3,4,5]'::halfvec, 1, 3); + subvector +----------- + [1,2,3] +(1 row) + +SELECT subvector('[1,2,3,4,5]'::halfvec, 3, 2); + subvector +----------- + [3,4] +(1 row) + +SELECT subvector('[1,2,3,4,5]'::halfvec, -1, 3); + subvector +----------- + [1] +(1 row) + +SELECT subvector('[1,2,3,4,5]'::halfvec, 3, 9); + subvector +----------- + [3,4,5] +(1 row) + +SELECT subvector('[1,2,3,4,5]'::halfvec, 1, 0); +ERROR: halfvec must have at least 1 dimension +SELECT subvector('[1,2,3,4,5]'::halfvec, 3, -1); +ERROR: halfvec must have at least 1 dimension +SELECT subvector('[1,2,3,4,5]'::halfvec, -1, 2); +ERROR: halfvec must have at least 1 
dimension +SELECT subvector('[1,2,3,4,5]'::halfvec, 2147483647, 10); +ERROR: halfvec must have at least 1 dimension +SELECT subvector('[1,2,3,4,5]'::halfvec, 3, 2147483647); + subvector +----------- + [3,4,5] +(1 row) + +SELECT subvector('[1,2,3,4,5]'::halfvec, -2147483644, 2147483647); + subvector +----------- + [1,2] +(1 row) + +SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::halfvec, '[3,5,7]']) v; + avg +----------- + [2,3.5,5] +(1 row) + +SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::halfvec, '[3,5,7]', NULL]) v; + avg +----------- + [2,3.5,5] +(1 row) + +SELECT avg(v) FROM unnest(ARRAY[]::halfvec[]) v; + avg +----- + +(1 row) + +SELECT avg(v) FROM unnest(ARRAY['[1,2]'::halfvec, '[3]']) v; +ERROR: expected 2 dimensions, not 1 +SELECT avg(v) FROM unnest(ARRAY['[65504]'::halfvec, '[65504]']) v; + avg +--------- + [65504] +(1 row) + +SELECT halfvec_avg(array_agg(n)) FROM generate_series(1, 16002) n; +ERROR: halfvec cannot have more than 16000 dimensions +SELECT sum(v) FROM unnest(ARRAY['[1,2,3]'::halfvec, '[3,5,7]']) v; + sum +---------- + [4,7,10] +(1 row) + +SELECT sum(v) FROM unnest(ARRAY['[1,2,3]'::halfvec, '[3,5,7]', NULL]) v; + sum +---------- + [4,7,10] +(1 row) + +SELECT sum(v) FROM unnest(ARRAY[]::halfvec[]) v; + sum +----- + +(1 row) + +SELECT sum(v) FROM unnest(ARRAY['[1,2]'::halfvec, '[3]']) v; +ERROR: different halfvec dimensions 2 and 1 +SELECT sum(v) FROM unnest(ARRAY['[65504]'::halfvec, '[65504]']) v; +ERROR: value out of range: overflow diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/input.out b/src/postgres/third-party-extensions/pgvector/test/expected/input.out deleted file mode 100644 index 19ef74d2354e..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/expected/input.out +++ /dev/null @@ -1,124 +0,0 @@ -SELECT '[1,2,3]'::vector; - vector ---------- - [1,2,3] -(1 row) - -SELECT '[-1,-2,-3]'::vector; - vector ------------- - [-1,-2,-3] -(1 row) - -SELECT '[1.,2.,3.]'::vector; - vector ---------- - [1,2,3] -(1 
row) - -SELECT ' [ 1, 2 , 3 ] '::vector; - vector ---------- - [1,2,3] -(1 row) - -SELECT '[1.23456]'::vector; - vector ------------ - [1.23456] -(1 row) - -SELECT '[hello,1]'::vector; -ERROR: invalid input syntax for type vector: "[hello,1]" -LINE 1: SELECT '[hello,1]'::vector; - ^ -SELECT '[NaN,1]'::vector; -ERROR: NaN not allowed in vector -LINE 1: SELECT '[NaN,1]'::vector; - ^ -SELECT '[Infinity,1]'::vector; -ERROR: infinite value not allowed in vector -LINE 1: SELECT '[Infinity,1]'::vector; - ^ -SELECT '[-Infinity,1]'::vector; -ERROR: infinite value not allowed in vector -LINE 1: SELECT '[-Infinity,1]'::vector; - ^ -SELECT '[1.5e38,-1.5e38]'::vector; - vector --------------------- - [1.5e+38,-1.5e+38] -(1 row) - -SELECT '[1.5e+38,-1.5e+38]'::vector; - vector --------------------- - [1.5e+38,-1.5e+38] -(1 row) - -SELECT '[1.5e-38,-1.5e-38]'::vector; - vector --------------------- - [1.5e-38,-1.5e-38] -(1 row) - -SELECT '[4e38,1]'::vector; -ERROR: infinite value not allowed in vector -LINE 1: SELECT '[4e38,1]'::vector; - ^ -SELECT '[1,2,3'::vector; -ERROR: malformed vector literal: "[1,2,3" -LINE 1: SELECT '[1,2,3'::vector; - ^ -DETAIL: Unexpected end of input. -SELECT '[1,2,3]9'::vector; -ERROR: malformed vector literal: "[1,2,3]9" -LINE 1: SELECT '[1,2,3]9'::vector; - ^ -DETAIL: Junk after closing right brace. -SELECT '1,2,3'::vector; -ERROR: malformed vector literal: "1,2,3" -LINE 1: SELECT '1,2,3'::vector; - ^ -DETAIL: Vector contents must start with "[". -SELECT '['::vector; -ERROR: malformed vector literal: "[" -LINE 1: SELECT '['::vector; - ^ -DETAIL: Unexpected end of input. -SELECT '[,'::vector; -ERROR: malformed vector literal: "[," -LINE 1: SELECT '[,'::vector; - ^ -DETAIL: Unexpected end of input. 
-SELECT '[]'::vector; -ERROR: vector must have at least 1 dimension -LINE 1: SELECT '[]'::vector; - ^ -SELECT '[1,]'::vector; -ERROR: invalid input syntax for type vector: "[1,]" -LINE 1: SELECT '[1,]'::vector; - ^ -SELECT '[1a]'::vector; -ERROR: invalid input syntax for type vector: "[1a]" -LINE 1: SELECT '[1a]'::vector; - ^ -SELECT '[1,,3]'::vector; -ERROR: malformed vector literal: "[1,,3]" -LINE 1: SELECT '[1,,3]'::vector; - ^ -SELECT '[1, ,3]'::vector; -ERROR: invalid input syntax for type vector: "[1, ,3]" -LINE 1: SELECT '[1, ,3]'::vector; - ^ -SELECT '[1,2,3]'::vector(2); -ERROR: expected 2 dimensions, not 3 -SELECT unnest('{"[1,2,3]", "[4,5,6]"}'::vector[]); - unnest ---------- - [1,2,3] - [4,5,6] -(2 rows) - -SELECT '{"[1,2,3]"}'::vector(2)[]; -ERROR: expected 2 dimensions, not 3 diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_cosine.out b/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_cosine.out deleted file mode 100644 index 96db5e0b42d3..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_cosine.out +++ /dev/null @@ -1,19 +0,0 @@ -SET enable_seqscan = off; -CREATE TABLE t (val vector(3)); -INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); -CREATE INDEX ON t USING ivfflat (val vector_cosine_ops) WITH (lists = 1); -INSERT INTO t (val) VALUES ('[1,2,4]'); -SELECT * FROM t ORDER BY val <=> '[3,3,3]'; - val ---------- - [1,1,1] - [1,2,3] - [1,2,4] -(3 rows) - -SELECT * FROM t ORDER BY val <=> (SELECT NULL::vector); - val ------ -(0 rows) - -DROP TABLE t; diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_ip.out b/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_ip.out deleted file mode 100644 index d4fc5380952a..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_ip.out +++ /dev/null @@ -1,20 +0,0 @@ -SET enable_seqscan = off; -CREATE TABLE t (val vector(3)); -INSERT 
INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); -CREATE INDEX ON t USING ivfflat (val vector_ip_ops) WITH (lists = 1); -INSERT INTO t (val) VALUES ('[1,2,4]'); -SELECT * FROM t ORDER BY val <#> '[3,3,3]'; - val ---------- - [1,2,4] - [1,2,3] - [1,1,1] - [0,0,0] -(4 rows) - -SELECT * FROM t ORDER BY val <#> (SELECT NULL::vector); - val ------ -(0 rows) - -DROP TABLE t; diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_l2.out b/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_l2.out deleted file mode 100644 index 2e8c6c2573a1..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_l2.out +++ /dev/null @@ -1,26 +0,0 @@ -SET enable_seqscan = off; -CREATE TABLE t (val vector(3)); -INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); -CREATE INDEX ON t USING ivfflat (val) WITH (lists = 1); -INSERT INTO t (val) VALUES ('[1,2,4]'); -SELECT * FROM t ORDER BY val <-> '[3,3,3]'; - val ---------- - [1,2,3] - [1,2,4] - [1,1,1] - [0,0,0] -(4 rows) - -SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector); - val ------ -(0 rows) - -SELECT COUNT(*) FROM t; - count -------- - 5 -(1 row) - -DROP TABLE t; diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_options.out b/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_options.out deleted file mode 100644 index 405a75d3e85f..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_options.out +++ /dev/null @@ -1,15 +0,0 @@ -SET enable_seqscan = off; -CREATE TABLE t (val vector(3)); -CREATE INDEX ON t USING ivfflat (val) WITH (lists = 0); -ERROR: value 0 out of bounds for option "lists" -DETAIL: Valid values are between "1" and "32768". -CREATE INDEX ON t USING ivfflat (val) WITH (lists = 32769); -ERROR: value 32769 out of bounds for option "lists" -DETAIL: Valid values are between "1" and "32768". 
-SHOW ivfflat.probes; - ivfflat.probes ----------------- - 1 -(1 row) - -DROP TABLE t; diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_unlogged.out b/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_unlogged.out deleted file mode 100644 index 198ea97a2251..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/expected/ivfflat_unlogged.out +++ /dev/null @@ -1,13 +0,0 @@ -SET enable_seqscan = off; -CREATE UNLOGGED TABLE t (val vector(3)); -INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); -CREATE INDEX ON t USING ivfflat (val) WITH (lists = 1); -SELECT * FROM t ORDER BY val <-> '[3,3,3]'; - val ---------- - [1,2,3] - [1,1,1] - [0,0,0] -(3 rows) - -DROP TABLE t; diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/sparsevec.out b/src/postgres/third-party-extensions/pgvector/test/expected/sparsevec.out new file mode 100644 index 000000000000..989ec82bc8eb --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/test/expected/sparsevec.out @@ -0,0 +1,653 @@ +SELECT '{1:1.5,3:3.5}/5'::sparsevec; + sparsevec +----------------- + {1:1.5,3:3.5}/5 +(1 row) + +SELECT '{1:-2,3:-4}/5'::sparsevec; + sparsevec +--------------- + {1:-2,3:-4}/5 +(1 row) + +SELECT '{1:2.,3:4.}/5'::sparsevec; + sparsevec +------------- + {1:2,3:4}/5 +(1 row) + +SELECT ' { 1 : 1.5 , 3 : 3.5 } / 5 '::sparsevec; + sparsevec +----------------- + {1:1.5,3:3.5}/5 +(1 row) + +SELECT '{1:1.23456}/1'::sparsevec; + sparsevec +--------------- + {1:1.23456}/1 +(1 row) + +SELECT '{1:hello,2:1}/2'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{1:hello,2:1}/2" +LINE 1: SELECT '{1:hello,2:1}/2'::sparsevec; + ^ +SELECT '{1:NaN,2:1}/2'::sparsevec; +ERROR: NaN not allowed in sparsevec +LINE 1: SELECT '{1:NaN,2:1}/2'::sparsevec; + ^ +SELECT '{1:Infinity,2:1}/2'::sparsevec; +ERROR: infinite value not allowed in sparsevec +LINE 1: SELECT '{1:Infinity,2:1}/2'::sparsevec; + ^ +SELECT 
'{1:-Infinity,2:1}/2'::sparsevec; +ERROR: infinite value not allowed in sparsevec +LINE 1: SELECT '{1:-Infinity,2:1}/2'::sparsevec; + ^ +SELECT '{1:1.5e38,2:-1.5e38}/2'::sparsevec; + sparsevec +-------------------------- + {1:1.5e+38,2:-1.5e+38}/2 +(1 row) + +SELECT '{1:1.5e+38,2:-1.5e+38}/2'::sparsevec; + sparsevec +-------------------------- + {1:1.5e+38,2:-1.5e+38}/2 +(1 row) + +SELECT '{1:1.5e-38,2:-1.5e-38}/2'::sparsevec; + sparsevec +-------------------------- + {1:1.5e-38,2:-1.5e-38}/2 +(1 row) + +SELECT '{1:4e38,2:1}/2'::sparsevec; +ERROR: "4e38" is out of range for type sparsevec +LINE 1: SELECT '{1:4e38,2:1}/2'::sparsevec; + ^ +SELECT '{1:-4e38,2:1}/2'::sparsevec; +ERROR: "-4e38" is out of range for type sparsevec +LINE 1: SELECT '{1:-4e38,2:1}/2'::sparsevec; + ^ +SELECT '{1:1e-46,2:1}/2'::sparsevec; +ERROR: "1e-46" is out of range for type sparsevec +LINE 1: SELECT '{1:1e-46,2:1}/2'::sparsevec; + ^ +SELECT '{1:-1e-46,2:1}/2'::sparsevec; +ERROR: "-1e-46" is out of range for type sparsevec +LINE 1: SELECT '{1:-1e-46,2:1}/2'::sparsevec; + ^ +SELECT ''::sparsevec; +ERROR: invalid input syntax for type sparsevec: "" +LINE 1: SELECT ''::sparsevec; + ^ +DETAIL: Vector contents must start with "{". +SELECT '{'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{" +LINE 1: SELECT '{'::sparsevec; + ^ +SELECT '{ '::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{ " +LINE 1: SELECT '{ '::sparsevec; + ^ +SELECT '{:'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{:" +LINE 1: SELECT '{:'::sparsevec; + ^ +SELECT '{,'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{," +LINE 1: SELECT '{,'::sparsevec; + ^ +SELECT '{}'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{}" +LINE 1: SELECT '{}'::sparsevec; + ^ +DETAIL: Unexpected end of input. 
+SELECT '{}/'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{}/" +LINE 1: SELECT '{}/'::sparsevec; + ^ +SELECT '{}/1'::sparsevec; + sparsevec +----------- + {}/1 +(1 row) + +SELECT '{}/1a'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{}/1a" +LINE 1: SELECT '{}/1a'::sparsevec; + ^ +DETAIL: Junk after closing. +SELECT '{ }/1'::sparsevec; + sparsevec +----------- + {}/1 +(1 row) + +SELECT '{:}/1'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{:}/1" +LINE 1: SELECT '{:}/1'::sparsevec; + ^ +SELECT '{,}/1'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{,}/1" +LINE 1: SELECT '{,}/1'::sparsevec; + ^ +SELECT '{1,}/1'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{1,}/1" +LINE 1: SELECT '{1,}/1'::sparsevec; + ^ +SELECT '{:1}/1'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{:1}/1" +LINE 1: SELECT '{:1}/1'::sparsevec; + ^ +SELECT '{1:}/1'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{1:}/1" +LINE 1: SELECT '{1:}/1'::sparsevec; + ^ +SELECT '{1a:1}/1'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{1a:1}/1" +LINE 1: SELECT '{1a:1}/1'::sparsevec; + ^ +SELECT '{1:1a}/1'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{1:1a}/1" +LINE 1: SELECT '{1:1a}/1'::sparsevec; + ^ +SELECT '{1:1,}/1'::sparsevec; +ERROR: invalid input syntax for type sparsevec: "{1:1,}/1" +LINE 1: SELECT '{1:1,}/1'::sparsevec; + ^ +SELECT '{1:0,2:1,3:0}/3'::sparsevec; + sparsevec +----------- + {2:1}/3 +(1 row) + +SELECT '{2:1,1:1}/2'::sparsevec; + sparsevec +------------- + {1:1,2:1}/2 +(1 row) + +SELECT '{1:1,1:1}/2'::sparsevec; +ERROR: sparsevec indices must not contain duplicates +LINE 1: SELECT '{1:1,1:1}/2'::sparsevec; + ^ +SELECT '{1:1,2:1,1:1}/2'::sparsevec; +ERROR: sparsevec indices must not contain duplicates +LINE 1: SELECT '{1:1,2:1,1:1}/2'::sparsevec; + ^ +SELECT '{}/5'::sparsevec; + sparsevec +----------- + {}/5 +(1 row) + +SELECT '{}/-1'::sparsevec; 
+ERROR: sparsevec must have at least 1 dimension +LINE 1: SELECT '{}/-1'::sparsevec; + ^ +SELECT '{}/1000000001'::sparsevec; +ERROR: sparsevec cannot have more than 1000000000 dimensions +LINE 1: SELECT '{}/1000000001'::sparsevec; + ^ +SELECT '{}/2147483648'::sparsevec; +ERROR: sparsevec cannot have more than 1000000000 dimensions +LINE 1: SELECT '{}/2147483648'::sparsevec; + ^ +SELECT '{}/-2147483649'::sparsevec; +ERROR: sparsevec must have at least 1 dimension +LINE 1: SELECT '{}/-2147483649'::sparsevec; + ^ +SELECT '{}/9223372036854775808'::sparsevec; +ERROR: sparsevec cannot have more than 1000000000 dimensions +LINE 1: SELECT '{}/9223372036854775808'::sparsevec; + ^ +SELECT '{}/-9223372036854775809'::sparsevec; +ERROR: sparsevec must have at least 1 dimension +LINE 1: SELECT '{}/-9223372036854775809'::sparsevec; + ^ +SELECT '{2147483647:1}/1'::sparsevec; +ERROR: sparsevec index out of bounds +LINE 1: SELECT '{2147483647:1}/1'::sparsevec; + ^ +SELECT '{2147483648:1}/1'::sparsevec; +ERROR: sparsevec index out of bounds +LINE 1: SELECT '{2147483648:1}/1'::sparsevec; + ^ +SELECT '{-2147483648:1}/1'::sparsevec; +ERROR: sparsevec index out of bounds +LINE 1: SELECT '{-2147483648:1}/1'::sparsevec; + ^ +SELECT '{-2147483649:1}/1'::sparsevec; +ERROR: sparsevec index out of bounds +LINE 1: SELECT '{-2147483649:1}/1'::sparsevec; + ^ +SELECT '{0:1}/1'::sparsevec; +ERROR: sparsevec index out of bounds +LINE 1: SELECT '{0:1}/1'::sparsevec; + ^ +SELECT '{2:1}/1'::sparsevec; +ERROR: sparsevec index out of bounds +LINE 1: SELECT '{2:1}/1'::sparsevec; + ^ +SELECT '{}/3'::sparsevec(3); + sparsevec +----------- + {}/3 +(1 row) + +SELECT '{}/3'::sparsevec(2); +ERROR: expected 2 dimensions, not 3 +SELECT '{}/3'::sparsevec(3, 2); +ERROR: invalid type modifier +LINE 1: SELECT '{}/3'::sparsevec(3, 2); + ^ +SELECT '{}/3'::sparsevec('a'); +ERROR: invalid input syntax for type integer: "a" +LINE 1: SELECT '{}/3'::sparsevec('a'); + ^ +SELECT '{}/3'::sparsevec(0); +ERROR: dimensions for 
type sparsevec must be at least 1 +LINE 1: SELECT '{}/3'::sparsevec(0); + ^ +SELECT '{}/3'::sparsevec(1000000001); +ERROR: dimensions for type sparsevec cannot exceed 1000000000 +LINE 1: SELECT '{}/3'::sparsevec(1000000001); + ^ +SELECT '{1:1,2:2,3:3}/3'::sparsevec < '{1:1,2:2,3:3}/3'; + ?column? +---------- + f +(1 row) + +SELECT '{1:1,2:2,3:3}/3'::sparsevec < '{1:1,2:2}/2'; + ?column? +---------- + f +(1 row) + +SELECT '{1:1,2:2,3:3}/3'::sparsevec <= '{1:1,2:2,3:3}/3'; + ?column? +---------- + t +(1 row) + +SELECT '{1:1,2:2,3:3}/3'::sparsevec <= '{1:1,2:2}/2'; + ?column? +---------- + f +(1 row) + +SELECT '{1:1,2:2,3:3}/3'::sparsevec = '{1:1,2:2,3:3}/3'; + ?column? +---------- + t +(1 row) + +SELECT '{1:1,2:2,3:3}/3'::sparsevec = '{1:1,2:2}/2'; + ?column? +---------- + f +(1 row) + +SELECT '{1:1,2:2,3:3}/3'::sparsevec != '{1:1,2:2,3:3}/3'; + ?column? +---------- + f +(1 row) + +SELECT '{1:1,2:2,3:3}/3'::sparsevec != '{1:1,2:2}/2'; + ?column? +---------- + t +(1 row) + +SELECT '{1:1,2:2,3:3}/3'::sparsevec >= '{1:1,2:2,3:3}/3'; + ?column? +---------- + t +(1 row) + +SELECT '{1:1,2:2,3:3}/3'::sparsevec >= '{1:1,2:2}/2'; + ?column? +---------- + t +(1 row) + +SELECT '{1:1,2:2,3:3}/3'::sparsevec > '{1:1,2:2,3:3}/3'; + ?column? +---------- + f +(1 row) + +SELECT '{1:1,2:2,3:3}/3'::sparsevec > '{1:1,2:2}/2'; + ?column? 
+---------- + t +(1 row) + +SELECT sparsevec_cmp('{1:1,2:2,3:3}/3', '{1:1,2:2,3:3}/3'); + sparsevec_cmp +--------------- + 0 +(1 row) + +SELECT sparsevec_cmp('{1:1,2:2,3:3}/3', '{}/3'); + sparsevec_cmp +--------------- + 1 +(1 row) + +SELECT sparsevec_cmp('{}/3', '{1:1,2:2,3:3}/3'); + sparsevec_cmp +--------------- + -1 +(1 row) + +SELECT sparsevec_cmp('{1:1,2:2}/2', '{1:1,2:2,3:3}/3'); + sparsevec_cmp +--------------- + -1 +(1 row) + +SELECT sparsevec_cmp('{1:1,2:2,3:3}/3', '{1:1,2:2}/2'); + sparsevec_cmp +--------------- + 1 +(1 row) + +SELECT sparsevec_cmp('{1:1,2:2}/2', '{1:2,2:3,3:4}/3'); + sparsevec_cmp +--------------- + -1 +(1 row) + +SELECT sparsevec_cmp('{1:2,2:3}/2', '{1:1,2:2,3:3}/3'); + sparsevec_cmp +--------------- + 1 +(1 row) + +SELECT round(l2_norm('{1:1,2:1}/2'::sparsevec)::numeric, 5); + round +--------- + 1.41421 +(1 row) + +SELECT l2_norm('{1:3,2:4}/2'::sparsevec); + l2_norm +--------- + 5 +(1 row) + +SELECT l2_norm('{2:1}/2'::sparsevec); + l2_norm +--------- + 1 +(1 row) + +SELECT l2_norm('{1:3e37,2:4e37}/2'::sparsevec)::real; + l2_norm +--------- + 5e+37 +(1 row) + +SELECT l2_norm('{}/2'::sparsevec); + l2_norm +--------- + 0 +(1 row) + +SELECT l2_norm('{1:2}/1'::sparsevec); + l2_norm +--------- + 2 +(1 row) + +SELECT l2_distance('{}/2'::sparsevec, '{1:3,2:4}/2'); + l2_distance +------------- + 5 +(1 row) + +SELECT l2_distance('{1:3}/2'::sparsevec, '{2:4}/2'); + l2_distance +------------- + 5 +(1 row) + +SELECT l2_distance('{2:4}/2'::sparsevec, '{1:3}/2'); + l2_distance +------------- + 5 +(1 row) + +SELECT l2_distance('{1:3,2:4}/2'::sparsevec, '{}/2'); + l2_distance +------------- + 5 +(1 row) + +SELECT l2_distance('{}/2'::sparsevec, '{2:1}/2'); + l2_distance +------------- + 1 +(1 row) + +SELECT '{}/2'::sparsevec <-> '{1:3,2:4}/2'; + ?column? 
+---------- + 5 +(1 row) + +SELECT inner_product('{1:1,2:2}/2'::sparsevec, '{1:2,2:4}/2'); + inner_product +--------------- + 10 +(1 row) + +SELECT inner_product('{1:1,2:2}/2'::sparsevec, '{1:3}/1'); +ERROR: different sparsevec dimensions 2 and 1 +SELECT inner_product('{1:1,3:3}/4'::sparsevec, '{2:2,4:4}/4'); + inner_product +--------------- + 0 +(1 row) + +SELECT inner_product('{2:2,4:4}/4'::sparsevec, '{1:1,3:3}/4'); + inner_product +--------------- + 0 +(1 row) + +SELECT inner_product('{1:1,3:3,5:5}/5'::sparsevec, '{2:4,3:6,4:8}/5'); + inner_product +--------------- + 18 +(1 row) + +SELECT inner_product('{1:1}/2'::sparsevec, '{}/2'); + inner_product +--------------- + 0 +(1 row) + +SELECT inner_product('{}/2'::sparsevec, '{1:1}/2'); + inner_product +--------------- + 0 +(1 row) + +SELECT inner_product('{1:3e38}/1'::sparsevec, '{1:3e38}/1'); + inner_product +--------------- + Infinity +(1 row) + +SELECT inner_product('{1:1,3:3,5:5}/5'::sparsevec, '{2:4,3:6,4:8}/5'); + inner_product +--------------- + 18 +(1 row) + +SELECT '{1:1,2:2}/2'::sparsevec <#> '{1:3,2:4}/2'; + ?column? 
+---------- + -11 +(1 row) + +SELECT cosine_distance('{1:1,2:2}/2'::sparsevec, '{1:2,2:4}/2'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('{1:1,2:2}/2'::sparsevec, '{}/2'); + cosine_distance +----------------- + NaN +(1 row) + +SELECT cosine_distance('{1:1,2:1}/2'::sparsevec, '{1:1,2:1}/2'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('{1:1}/2'::sparsevec, '{2:2}/2'); + cosine_distance +----------------- + 1 +(1 row) + +SELECT cosine_distance('{1:1,2:1}/2'::sparsevec, '{1:-1,2:-1}/2'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT cosine_distance('{1:2}/2'::sparsevec, '{2:2}/2'); + cosine_distance +----------------- + 1 +(1 row) + +SELECT cosine_distance('{2:2}/2'::sparsevec, '{1:2}/2'); + cosine_distance +----------------- + 1 +(1 row) + +SELECT cosine_distance('{1:1,2:2}/2'::sparsevec, '{1:3}/1'); +ERROR: different sparsevec dimensions 2 and 1 +SELECT cosine_distance('{1:1,2:1}/2'::sparsevec, '{1:1.1,2:1.1}/2'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('{1:1,2:1}/2'::sparsevec, '{1:-1.1,2:-1.1}/2'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT cosine_distance('{1:3e38}/1'::sparsevec, '{1:3e38}/1'); + cosine_distance +----------------- + NaN +(1 row) + +SELECT cosine_distance('{}/1'::sparsevec, '{}/1'); + cosine_distance +----------------- + NaN +(1 row) + +SELECT '{1:1,2:2}/2'::sparsevec <=> '{1:2,2:4}/2'; + ?column? 
+---------- + 0 +(1 row) + +SELECT l1_distance('{}/2'::sparsevec, '{1:3,2:4}/2'); + l1_distance +------------- + 7 +(1 row) + +SELECT l1_distance('{}/2'::sparsevec, '{2:1}/2'); + l1_distance +------------- + 1 +(1 row) + +SELECT l1_distance('{1:1,2:2}/2'::sparsevec, '{1:3}/1'); +ERROR: different sparsevec dimensions 2 and 1 +SELECT l1_distance('{1:3e38}/1'::sparsevec, '{1:-3e38}/1'); + l1_distance +------------- + Infinity +(1 row) + +SELECT l1_distance('{1:1,3:3,5:5,7:7}/8'::sparsevec, '{2:2,4:4,6:6,8:8}/8'); + l1_distance +------------- + 36 +(1 row) + +SELECT l1_distance('{1:1,3:3,5:5,7:7,9:9}/9'::sparsevec, '{2:2,4:4,6:6,8:8}/9'); + l1_distance +------------- + 45 +(1 row) + +SELECT '{}/2'::sparsevec <+> '{1:3,2:4}/2'; + ?column? +---------- + 7 +(1 row) + +SELECT l2_normalize('{1:3,2:4}/2'::sparsevec); + l2_normalize +----------------- + {1:0.6,2:0.8}/2 +(1 row) + +SELECT l2_normalize('{1:3}/2'::sparsevec); + l2_normalize +-------------- + {1:1}/2 +(1 row) + +SELECT l2_normalize('{2:0.1}/2'::sparsevec); + l2_normalize +-------------- + {2:1}/2 +(1 row) + +SELECT l2_normalize('{}/2'::sparsevec); + l2_normalize +-------------- + {}/2 +(1 row) + +SELECT l2_normalize('{1:3e38}/1'::sparsevec); + l2_normalize +-------------- + {1:1}/1 +(1 row) + +SELECT l2_normalize('{1:3e38,2:1e-37}/2'::sparsevec); + l2_normalize +-------------- + {1:1}/2 +(1 row) + +SELECT l2_normalize('{2:3e37,4:3e-37,6:4e37,8:4e-37}/9'::sparsevec); + l2_normalize +----------------- + {2:0.6,6:0.8}/9 +(1 row) + diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/vector_type.out b/src/postgres/third-party-extensions/pgvector/test/expected/vector_type.out new file mode 100644 index 000000000000..67486582241c --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/test/expected/vector_type.out @@ -0,0 +1,672 @@ +SELECT '[1,2,3]'::vector; + vector +--------- + [1,2,3] +(1 row) + +SELECT '[-1,-2,-3]'::vector; + vector +------------ + [-1,-2,-3] +(1 row) + +SELECT 
'[1.,2.,3.]'::vector; + vector +--------- + [1,2,3] +(1 row) + +SELECT ' [ 1, 2 , 3 ] '::vector; + vector +--------- + [1,2,3] +(1 row) + +SELECT '[1.23456]'::vector; + vector +----------- + [1.23456] +(1 row) + +SELECT '[hello,1]'::vector; +ERROR: invalid input syntax for type vector: "[hello,1]" +LINE 1: SELECT '[hello,1]'::vector; + ^ +SELECT '[NaN,1]'::vector; +ERROR: NaN not allowed in vector +LINE 1: SELECT '[NaN,1]'::vector; + ^ +SELECT '[Infinity,1]'::vector; +ERROR: infinite value not allowed in vector +LINE 1: SELECT '[Infinity,1]'::vector; + ^ +SELECT '[-Infinity,1]'::vector; +ERROR: infinite value not allowed in vector +LINE 1: SELECT '[-Infinity,1]'::vector; + ^ +SELECT '[1.5e38,-1.5e38]'::vector; + vector +-------------------- + [1.5e+38,-1.5e+38] +(1 row) + +SELECT '[1.5e+38,-1.5e+38]'::vector; + vector +-------------------- + [1.5e+38,-1.5e+38] +(1 row) + +SELECT '[1.5e-38,-1.5e-38]'::vector; + vector +-------------------- + [1.5e-38,-1.5e-38] +(1 row) + +SELECT '[4e38,1]'::vector; +ERROR: "4e38" is out of range for type vector +LINE 1: SELECT '[4e38,1]'::vector; + ^ +SELECT '[-4e38,1]'::vector; +ERROR: "-4e38" is out of range for type vector +LINE 1: SELECT '[-4e38,1]'::vector; + ^ +SELECT '[1e-46,1]'::vector; + vector +-------- + [0,1] +(1 row) + +SELECT '[-1e-46,1]'::vector; + vector +-------- + [-0,1] +(1 row) + +SELECT '[1,2,3'::vector; +ERROR: invalid input syntax for type vector: "[1,2,3" +LINE 1: SELECT '[1,2,3'::vector; + ^ +SELECT '[1,2,3]9'::vector; +ERROR: invalid input syntax for type vector: "[1,2,3]9" +LINE 1: SELECT '[1,2,3]9'::vector; + ^ +DETAIL: Junk after closing right brace. +SELECT '1,2,3'::vector; +ERROR: invalid input syntax for type vector: "1,2,3" +LINE 1: SELECT '1,2,3'::vector; + ^ +DETAIL: Vector contents must start with "[". +SELECT ''::vector; +ERROR: invalid input syntax for type vector: "" +LINE 1: SELECT ''::vector; + ^ +DETAIL: Vector contents must start with "[". 
+SELECT '['::vector; +ERROR: invalid input syntax for type vector: "[" +LINE 1: SELECT '['::vector; + ^ +SELECT '[ '::vector; +ERROR: invalid input syntax for type vector: "[ " +LINE 1: SELECT '[ '::vector; + ^ +SELECT '[,'::vector; +ERROR: invalid input syntax for type vector: "[," +LINE 1: SELECT '[,'::vector; + ^ +SELECT '[]'::vector; +ERROR: vector must have at least 1 dimension +LINE 1: SELECT '[]'::vector; + ^ +SELECT '[ ]'::vector; +ERROR: vector must have at least 1 dimension +LINE 1: SELECT '[ ]'::vector; + ^ +SELECT '[,]'::vector; +ERROR: invalid input syntax for type vector: "[,]" +LINE 1: SELECT '[,]'::vector; + ^ +SELECT '[1,]'::vector; +ERROR: invalid input syntax for type vector: "[1,]" +LINE 1: SELECT '[1,]'::vector; + ^ +SELECT '[1a]'::vector; +ERROR: invalid input syntax for type vector: "[1a]" +LINE 1: SELECT '[1a]'::vector; + ^ +SELECT '[1,,3]'::vector; +ERROR: invalid input syntax for type vector: "[1,,3]" +LINE 1: SELECT '[1,,3]'::vector; + ^ +SELECT '[1, ,3]'::vector; +ERROR: invalid input syntax for type vector: "[1, ,3]" +LINE 1: SELECT '[1, ,3]'::vector; + ^ +SELECT '[1,2,3]'::vector(3); + vector +--------- + [1,2,3] +(1 row) + +SELECT '[1,2,3]'::vector(2); +ERROR: expected 2 dimensions, not 3 +SELECT '[1,2,3]'::vector(3, 2); +ERROR: invalid type modifier +LINE 1: SELECT '[1,2,3]'::vector(3, 2); + ^ +SELECT '[1,2,3]'::vector('a'); +ERROR: invalid input syntax for type integer: "a" +LINE 1: SELECT '[1,2,3]'::vector('a'); + ^ +SELECT '[1,2,3]'::vector(0); +ERROR: dimensions for type vector must be at least 1 +LINE 1: SELECT '[1,2,3]'::vector(0); + ^ +SELECT '[1,2,3]'::vector(16001); +ERROR: dimensions for type vector cannot exceed 16000 +LINE 1: SELECT '[1,2,3]'::vector(16001); + ^ +SELECT unnest('{"[1,2,3]", "[4,5,6]"}'::vector[]); + unnest +--------- + [1,2,3] + [4,5,6] +(2 rows) + +SELECT '{"[1,2,3]"}'::vector(2)[]; +ERROR: expected 2 dimensions, not 3 +SELECT '[1,2,3]'::vector + '[4,5,6]'; + ?column? 
+---------- + [5,7,9] +(1 row) + +SELECT '[3e38]'::vector + '[3e38]'; +ERROR: value out of range: overflow +SELECT '[1,2]'::vector + '[3]'; +ERROR: different vector dimensions 2 and 1 +SELECT '[1,2,3]'::vector - '[4,5,6]'; + ?column? +------------ + [-3,-3,-3] +(1 row) + +SELECT '[-3e38]'::vector - '[3e38]'; +ERROR: value out of range: overflow +SELECT '[1,2]'::vector - '[3]'; +ERROR: different vector dimensions 2 and 1 +SELECT '[1,2,3]'::vector * '[4,5,6]'; + ?column? +----------- + [4,10,18] +(1 row) + +SELECT '[1e37]'::vector * '[1e37]'; +ERROR: value out of range: overflow +SELECT '[1e-37]'::vector * '[1e-37]'; +ERROR: value out of range: underflow +SELECT '[1,2]'::vector * '[3]'; +ERROR: different vector dimensions 2 and 1 +SELECT '[1,2,3]'::vector || '[4,5]'; + ?column? +------------- + [1,2,3,4,5] +(1 row) + +SELECT array_fill(0, ARRAY[16000])::vector || '[1]'; +ERROR: vector cannot have more than 16000 dimensions +SELECT '[1,2,3]'::vector < '[1,2,3]'; + ?column? +---------- + f +(1 row) + +SELECT '[1,2,3]'::vector < '[1,2]'; + ?column? +---------- + f +(1 row) + +SELECT '[1,2,3]'::vector <= '[1,2,3]'; + ?column? +---------- + t +(1 row) + +SELECT '[1,2,3]'::vector <= '[1,2]'; + ?column? +---------- + f +(1 row) + +SELECT '[1,2,3]'::vector = '[1,2,3]'; + ?column? +---------- + t +(1 row) + +SELECT '[1,2,3]'::vector = '[1,2]'; + ?column? +---------- + f +(1 row) + +SELECT '[1,2,3]'::vector != '[1,2,3]'; + ?column? +---------- + f +(1 row) + +SELECT '[1,2,3]'::vector != '[1,2]'; + ?column? +---------- + t +(1 row) + +SELECT '[1,2,3]'::vector >= '[1,2,3]'; + ?column? +---------- + t +(1 row) + +SELECT '[1,2,3]'::vector >= '[1,2]'; + ?column? +---------- + t +(1 row) + +SELECT '[1,2,3]'::vector > '[1,2,3]'; + ?column? +---------- + f +(1 row) + +SELECT '[1,2,3]'::vector > '[1,2]'; + ?column? 
+---------- + t +(1 row) + +SELECT vector_cmp('[1,2,3]', '[1,2,3]'); + vector_cmp +------------ + 0 +(1 row) + +SELECT vector_cmp('[1,2,3]', '[0,0,0]'); + vector_cmp +------------ + 1 +(1 row) + +SELECT vector_cmp('[0,0,0]', '[1,2,3]'); + vector_cmp +------------ + -1 +(1 row) + +SELECT vector_cmp('[1,2]', '[1,2,3]'); + vector_cmp +------------ + -1 +(1 row) + +SELECT vector_cmp('[1,2,3]', '[1,2]'); + vector_cmp +------------ + 1 +(1 row) + +SELECT vector_cmp('[1,2]', '[2,3,4]'); + vector_cmp +------------ + -1 +(1 row) + +SELECT vector_cmp('[2,3]', '[1,2,3]'); + vector_cmp +------------ + 1 +(1 row) + +SELECT vector_dims('[1,2,3]'::vector); + vector_dims +------------- + 3 +(1 row) + +SELECT round(vector_norm('[1,1]')::numeric, 5); + round +--------- + 1.41421 +(1 row) + +SELECT vector_norm('[3,4]'); + vector_norm +------------- + 5 +(1 row) + +SELECT vector_norm('[0,1]'); + vector_norm +------------- + 1 +(1 row) + +SELECT vector_norm('[3e37,4e37]')::real; + vector_norm +------------- + 5e+37 +(1 row) + +SELECT vector_norm('[0,0]'); + vector_norm +------------- + 0 +(1 row) + +SELECT vector_norm('[2]'); + vector_norm +------------- + 2 +(1 row) + +SELECT l2_distance('[0,0]'::vector, '[3,4]'); + l2_distance +------------- + 5 +(1 row) + +SELECT l2_distance('[0,0]'::vector, '[0,1]'); + l2_distance +------------- + 1 +(1 row) + +SELECT l2_distance('[1,2]'::vector, '[3]'); +ERROR: different vector dimensions 2 and 1 +SELECT l2_distance('[3e38]'::vector, '[-3e38]'); + l2_distance +------------- + Infinity +(1 row) + +SELECT l2_distance('[1,1,1,1,1,1,1,1,1]'::vector, '[1,1,1,1,1,1,1,4,5]'); + l2_distance +------------- + 5 +(1 row) + +SELECT '[0,0]'::vector <-> '[3,4]'; + ?column? 
+---------- + 5 +(1 row) + +SELECT inner_product('[1,2]'::vector, '[3,4]'); + inner_product +--------------- + 11 +(1 row) + +SELECT inner_product('[1,2]'::vector, '[3]'); +ERROR: different vector dimensions 2 and 1 +SELECT inner_product('[3e38]'::vector, '[3e38]'); + inner_product +--------------- + Infinity +(1 row) + +SELECT inner_product('[1,1,1,1,1,1,1,1,1]'::vector, '[1,2,3,4,5,6,7,8,9]'); + inner_product +--------------- + 45 +(1 row) + +SELECT '[1,2]'::vector <#> '[3,4]'; + ?column? +---------- + -11 +(1 row) + +SELECT cosine_distance('[1,2]'::vector, '[2,4]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,2]'::vector, '[0,0]'); + cosine_distance +----------------- + NaN +(1 row) + +SELECT cosine_distance('[1,1]'::vector, '[1,1]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,0]'::vector, '[0,2]'); + cosine_distance +----------------- + 1 +(1 row) + +SELECT cosine_distance('[1,1]'::vector, '[-1,-1]'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT cosine_distance('[1,2]'::vector, '[3]'); +ERROR: different vector dimensions 2 and 1 +SELECT cosine_distance('[1,1]'::vector, '[1.1,1.1]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,1]'::vector, '[-1.1,-1.1]'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT cosine_distance('[3e38]'::vector, '[3e38]'); + cosine_distance +----------------- + NaN +(1 row) + +SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::vector, '[1,2,3,4,5,6,7,8,9]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::vector, '[-1,-2,-3,-4,-5,-6,-7,-8,-9]'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT '[1,2]'::vector <=> '[2,4]'; + ?column? 
+---------- + 0 +(1 row) + +SELECT l1_distance('[0,0]'::vector, '[3,4]'); + l1_distance +------------- + 7 +(1 row) + +SELECT l1_distance('[0,0]'::vector, '[0,1]'); + l1_distance +------------- + 1 +(1 row) + +SELECT l1_distance('[1,2]'::vector, '[3]'); +ERROR: different vector dimensions 2 and 1 +SELECT l1_distance('[3e38]'::vector, '[-3e38]'); + l1_distance +------------- + Infinity +(1 row) + +SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::vector, '[1,2,3,4,5,6,7,8,9]'); + l1_distance +------------- + 0 +(1 row) + +SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::vector, '[0,3,2,5,4,7,6,9,8]'); + l1_distance +------------- + 9 +(1 row) + +SELECT '[0,0]'::vector <+> '[3,4]'; + ?column? +---------- + 7 +(1 row) + +SELECT l2_normalize('[3,4]'::vector); + l2_normalize +-------------- + [0.6,0.8] +(1 row) + +SELECT l2_normalize('[3,0]'::vector); + l2_normalize +-------------- + [1,0] +(1 row) + +SELECT l2_normalize('[0,0.1]'::vector); + l2_normalize +-------------- + [0,1] +(1 row) + +SELECT l2_normalize('[0,0]'::vector); + l2_normalize +-------------- + [0,0] +(1 row) + +SELECT l2_normalize('[3e38]'::vector); + l2_normalize +-------------- + [1] +(1 row) + +SELECT binary_quantize('[1,0,-1]'::vector); + binary_quantize +----------------- + 100 +(1 row) + +SELECT binary_quantize('[0,0.1,-0.2,-0.3,0.4,0.5,0.6,-0.7,0.8,-0.9,1]'::vector); + binary_quantize +----------------- + 01001110101 +(1 row) + +SELECT subvector('[1,2,3,4,5]'::vector, 1, 3); + subvector +----------- + [1,2,3] +(1 row) + +SELECT subvector('[1,2,3,4,5]'::vector, 3, 2); + subvector +----------- + [3,4] +(1 row) + +SELECT subvector('[1,2,3,4,5]'::vector, -1, 3); + subvector +----------- + [1] +(1 row) + +SELECT subvector('[1,2,3,4,5]'::vector, 3, 9); + subvector +----------- + [3,4,5] +(1 row) + +SELECT subvector('[1,2,3,4,5]'::vector, 1, 0); +ERROR: vector must have at least 1 dimension +SELECT subvector('[1,2,3,4,5]'::vector, 3, -1); +ERROR: vector must have at least 1 dimension +SELECT 
subvector('[1,2,3,4,5]'::vector, -1, 2); +ERROR: vector must have at least 1 dimension +SELECT subvector('[1,2,3,4,5]'::vector, 2147483647, 10); +ERROR: vector must have at least 1 dimension +SELECT subvector('[1,2,3,4,5]'::vector, 3, 2147483647); + subvector +----------- + [3,4,5] +(1 row) + +SELECT subvector('[1,2,3,4,5]'::vector, -2147483644, 2147483647); + subvector +----------- + [1,2] +(1 row) + +SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]']) v; + avg +----------- + [2,3.5,5] +(1 row) + +SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]', NULL]) v; + avg +----------- + [2,3.5,5] +(1 row) + +SELECT avg(v) FROM unnest(ARRAY[]::vector[]) v; + avg +----- + +(1 row) + +SELECT avg(v) FROM unnest(ARRAY['[1,2]'::vector, '[3]']) v; +ERROR: expected 2 dimensions, not 1 +SELECT avg(v) FROM unnest(ARRAY['[3e38]'::vector, '[3e38]']) v; + avg +--------- + [3e+38] +(1 row) + +SELECT vector_avg(array_agg(n)) FROM generate_series(1, 16002) n; +ERROR: vector cannot have more than 16000 dimensions +SELECT sum(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]']) v; + sum +---------- + [4,7,10] +(1 row) + +SELECT sum(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]', NULL]) v; + sum +---------- + [4,7,10] +(1 row) + +SELECT sum(v) FROM unnest(ARRAY[]::vector[]) v; + sum +----- + +(1 row) + +SELECT sum(v) FROM unnest(ARRAY['[1,2]'::vector, '[3]']) v; +ERROR: different vector dimensions 2 and 1 +SELECT sum(v) FROM unnest(ARRAY['[3e38]'::vector, '[3e38]']) v; +ERROR: value out of range: overflow diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/yb.orig.setup.out b/src/postgres/third-party-extensions/pgvector/test/expected/yb.orig.setup.out index 05a495838412..c456f5eb72ea 100644 --- a/src/postgres/third-party-extensions/pgvector/test/expected/yb.orig.setup.out +++ b/src/postgres/third-party-extensions/pgvector/test/expected/yb.orig.setup.out @@ -1 +1 @@ -CREATE EXTENSION vector VERSION '0.4.4-yb-1.2'; +CREATE EXTENSION vector VERSION 
'0.8.0-yb-1.0'; diff --git a/src/postgres/third-party-extensions/pgvector/test/expected/yb.port.copy.out b/src/postgres/third-party-extensions/pgvector/test/expected/yb.port.copy.out new file mode 100644 index 000000000000..db9f330ab7ab --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/test/expected/yb.port.copy.out @@ -0,0 +1,51 @@ +-- vector +CREATE TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE TABLE t2 (val vector(3)); +\copy t TO '/tmp/vector.bin' WITH (FORMAT binary) +\copy t2 FROM '/tmp/vector.bin' WITH (FORMAT binary) +SELECT * FROM t2 ORDER BY val; + val +--------- + [0,0,0] + [1,1,1] + [1,2,3] + +(4 rows) + +DROP TABLE t; +DROP TABLE t2; +-- halfvec +CREATE TABLE t (val halfvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE TABLE t2 (val halfvec(3)); +\copy t TO '/tmp/halfvec.bin' WITH (FORMAT binary) +\copy t2 FROM '/tmp/halfvec.bin' WITH (FORMAT binary) +SELECT * FROM t2 ORDER BY val; + val +--------- + [0,0,0] + [1,1,1] + [1,2,3] + +(4 rows) + +DROP TABLE t; +DROP TABLE t2; +-- sparsevec +CREATE TABLE t (val sparsevec(3)); +INSERT INTO t (val) VALUES ('{}/3'), ('{1:1,2:2,3:3}/3'), ('{1:1,2:1,3:1}/3'), (NULL); +CREATE TABLE t2 (val sparsevec(3)); +\copy t TO '/tmp/sparsevec.bin' WITH (FORMAT binary) +\copy t2 FROM '/tmp/sparsevec.bin' WITH (FORMAT binary) +SELECT * FROM t2 ORDER BY val; + val +----------------- + {}/3 + {1:1,2:1,3:1}/3 + {1:1,2:2,3:3}/3 + +(4 rows) + +DROP TABLE t; +DROP TABLE t2; diff --git a/src/postgres/third-party-extensions/pgvector/test/perl/PostgresNode.pm b/src/postgres/third-party-extensions/pgvector/test/perl/PostgresNode.pm deleted file mode 100644 index 32ffb95fa99d..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/perl/PostgresNode.pm +++ /dev/null @@ -1,8 +0,0 @@ -use PostgreSQL::Test::Cluster; - -sub get_new_node -{ - return PostgreSQL::Test::Cluster->new(@_); -} - -1; diff --git 
a/src/postgres/third-party-extensions/pgvector/test/perl/TestLib.pm b/src/postgres/third-party-extensions/pgvector/test/perl/TestLib.pm deleted file mode 100644 index 1cb2a826e3c0..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/perl/TestLib.pm +++ /dev/null @@ -1,3 +0,0 @@ -use PostgreSQL::Test::Utils; - -1; diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/bit.sql b/src/postgres/third-party-extensions/pgvector/test/sql/bit.sql new file mode 100644 index 000000000000..eee9ce60cfba --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/test/sql/bit.sql @@ -0,0 +1,27 @@ +SELECT hamming_distance('111', '111'); +SELECT hamming_distance('111', '110'); +SELECT hamming_distance('111', '100'); +SELECT hamming_distance('111', '000'); +SELECT hamming_distance('10101010101010101010', '01010101010101010101'); +SELECT hamming_distance('101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101', '101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101'); +SELECT 
hamming_distance('101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101', '010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010'); +SELECT hamming_distance('110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011', 
'100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001'); +SELECT hamming_distance('', ''); +SELECT hamming_distance('111', '00'); +SELECT hamming_distance('111', '000'::varbit(4)); +SELECT hamming_distance('111', '0000'::varbit(4)); + +SELECT jaccard_distance('1111', '1111'); +SELECT jaccard_distance('1111', '1110'); +SELECT jaccard_distance('1111', '1100'); +SELECT jaccard_distance('1111', '1000'); +SELECT jaccard_distance('1111', '0000'); +SELECT jaccard_distance('1100', '1000'); +SELECT jaccard_distance('10101010101010101010', '01010101010101010101'); +SELECT jaccard_distance('101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101', 
'101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101'); +SELECT jaccard_distance('101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101', '010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010'); +SELECT 
jaccard_distance('110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011', '100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001'); +SELECT jaccard_distance('', ''); +SELECT jaccard_distance('1111', '000'); +SELECT jaccard_distance('1111', '0000'::varbit(5)); +SELECT jaccard_distance('1111', '00000'::varbit(5)); diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/btree.sql b/src/postgres/third-party-extensions/pgvector/test/sql/btree.sql index 232b28818116..de583c33c913 100644 --- a/src/postgres/third-party-extensions/pgvector/test/sql/btree.sql +++ b/src/postgres/third-party-extensions/pgvector/test/sql/btree.sql @@ -1,10 +1,34 @@ SET enable_seqscan = off; +-- vector + CREATE TABLE t (val vector(3)); INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); CREATE INDEX ON t (val); SELECT * FROM t WHERE val = '[1,2,3]'; -SELECT * FROM t ORDER BY val LIMIT 1; +SELECT * FROM t ORDER BY val; + +DROP TABLE t; + +-- halfvec + +CREATE TABLE t (val halfvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); 
+CREATE INDEX ON t (val); + +SELECT * FROM t WHERE val = '[1,2,3]'; +SELECT * FROM t ORDER BY val; + +DROP TABLE t; + +-- sparsevec + +CREATE TABLE t (val sparsevec(3)); +INSERT INTO t (val) VALUES ('{}/3'), ('{1:1,2:2,3:3}/3'), ('{1:1,2:1,3:1}/3'), (NULL); +CREATE INDEX ON t (val); + +SELECT * FROM t WHERE val = '{1:1,2:2,3:3}/3'; +SELECT * FROM t ORDER BY val; DROP TABLE t; diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/cast.sql b/src/postgres/third-party-extensions/pgvector/test/sql/cast.sql index cb5c88094d3d..fe8393119caa 100644 --- a/src/postgres/third-party-extensions/pgvector/test/sql/cast.sql +++ b/src/postgres/third-party-extensions/pgvector/test/sql/cast.sql @@ -3,12 +3,77 @@ SELECT ARRAY[1.0,2.0,3.0]::vector; SELECT ARRAY[1,2,3]::float4[]::vector; SELECT ARRAY[1,2,3]::float8[]::vector; SELECT ARRAY[1,2,3]::numeric[]::vector; + +SELECT '[1,2,3]'::vector::real[]; + +SELECT '{1,2,3}'::real[]::vector; +SELECT '{1,2,3}'::real[]::vector(3); +SELECT '{1,2,3}'::real[]::vector(2); SELECT '{NULL}'::real[]::vector; SELECT '{NaN}'::real[]::vector; SELECT '{Infinity}'::real[]::vector; SELECT '{-Infinity}'::real[]::vector; SELECT '{}'::real[]::vector; -SELECT '[1,2,3]'::vector::real[]; +SELECT '{{1}}'::real[]::vector; + +SELECT '{1,2,3}'::double precision[]::vector; +SELECT '{1,2,3}'::double precision[]::vector(3); +SELECT '{1,2,3}'::double precision[]::vector(2); +SELECT '{4e38,-4e38}'::double precision[]::vector; +SELECT '{1e-46,-1e-46}'::double precision[]::vector; + +SELECT '[1,2,3]'::vector::halfvec; +SELECT '[1,2,3]'::vector::halfvec(3); +SELECT '[1,2,3]'::vector::halfvec(2); +SELECT '[65520]'::vector::halfvec; +SELECT '[1e-8]'::vector::halfvec; + +SELECT '[1,2,3]'::halfvec::vector; +SELECT '[1,2,3]'::halfvec::vector(3); +SELECT '[1,2,3]'::halfvec::vector(2); + +SELECT '{1,2,3}'::real[]::halfvec; +SELECT '{1,2,3}'::real[]::halfvec(3); +SELECT '{1,2,3}'::real[]::halfvec(2); +SELECT '{65520,-65520}'::real[]::halfvec; +SELECT 
'{1e-8,-1e-8}'::real[]::halfvec; + +SELECT '[0,1.5,0,3.5,0]'::vector::sparsevec; +SELECT '[0,1.5,0,3.5,0]'::vector::sparsevec(5); +SELECT '[0,1.5,0,3.5,0]'::vector::sparsevec(4); + +SELECT '{2:1.5,4:3.5}/5'::sparsevec::vector; +SELECT '{2:1.5,4:3.5}/5'::sparsevec::vector(5); +SELECT '{2:1.5,4:3.5}/5'::sparsevec::vector(4); +SELECT '{}/16001'::sparsevec::vector; + +SELECT '[0,1.5,0,3.5,0]'::halfvec::sparsevec; +SELECT '[0,1.5,0,3.5,0]'::halfvec::sparsevec(5); +SELECT '[0,1.5,0,3.5,0]'::halfvec::sparsevec(4); + +SELECT '{2:1.5,4:3.5}/5'::sparsevec::halfvec; +SELECT '{2:1.5,4:3.5}/5'::sparsevec::halfvec(5); +SELECT '{2:1.5,4:3.5}/5'::sparsevec::halfvec(4); +SELECT '{}/16001'::sparsevec::halfvec; +SELECT '{1:65520}/1'::sparsevec::halfvec; +SELECT '{1:1e-8}/1'::sparsevec::halfvec; + +SELECT ARRAY[1,0,2,0,3,0]::sparsevec; +SELECT ARRAY[1.0,0.0,2.0,0.0,3.0,0.0]::sparsevec; +SELECT ARRAY[1,0,2,0,3,0]::float4[]::sparsevec; +SELECT ARRAY[1,0,2,0,3,0]::float8[]::sparsevec; +SELECT ARRAY[1,0,2,0,3,0]::numeric[]::sparsevec; + +SELECT '{1,0,2,0,3,0}'::real[]::sparsevec; +SELECT '{1,0,2,0,3,0}'::real[]::sparsevec(6); +SELECT '{1,0,2,0,3,0}'::real[]::sparsevec(5); +SELECT '{NULL}'::real[]::sparsevec; +SELECT '{NaN}'::real[]::sparsevec; +SELECT '{Infinity}'::real[]::sparsevec; +SELECT '{-Infinity}'::real[]::sparsevec; +SELECT '{}'::real[]::sparsevec; +SELECT '{{1}}'::real[]::sparsevec; + SELECT array_agg(n)::vector FROM generate_series(1, 16001) n; SELECT array_to_vector(array_agg(n), 16001, false) FROM generate_series(1, 16001) n; diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/copy.sql b/src/postgres/third-party-extensions/pgvector/test/sql/copy.sql index 28200901c572..2dff3ffb7fb7 100644 --- a/src/postgres/third-party-extensions/pgvector/test/sql/copy.sql +++ b/src/postgres/third-party-extensions/pgvector/test/sql/copy.sql @@ -1,10 +1,42 @@ +-- vector + CREATE TABLE t (val vector(3)); INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); 
CREATE TABLE t2 (val vector(3)); -\copy t TO 'results/data.bin' WITH (FORMAT binary) -\copy t2 FROM 'results/data.bin' WITH (FORMAT binary) +\copy t TO 'results/vector.bin' WITH (FORMAT binary) +\copy t2 FROM 'results/vector.bin' WITH (FORMAT binary) + +SELECT * FROM t2 ORDER BY val; + +DROP TABLE t; +DROP TABLE t2; + +-- halfvec + +CREATE TABLE t (val halfvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); + +CREATE TABLE t2 (val halfvec(3)); + +\copy t TO 'results/halfvec.bin' WITH (FORMAT binary) +\copy t2 FROM 'results/halfvec.bin' WITH (FORMAT binary) + +SELECT * FROM t2 ORDER BY val; + +DROP TABLE t; +DROP TABLE t2; + +-- sparsevec + +CREATE TABLE t (val sparsevec(3)); +INSERT INTO t (val) VALUES ('{}/3'), ('{1:1,2:2,3:3}/3'), ('{1:1,2:1,3:1}/3'), (NULL); + +CREATE TABLE t2 (val sparsevec(3)); + +\copy t TO 'results/sparsevec.bin' WITH (FORMAT binary) +\copy t2 FROM 'results/sparsevec.bin' WITH (FORMAT binary) SELECT * FROM t2 ORDER BY val; diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/functions.sql b/src/postgres/third-party-extensions/pgvector/test/sql/functions.sql deleted file mode 100644 index e4d33172f316..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/sql/functions.sql +++ /dev/null @@ -1,29 +0,0 @@ -SELECT '[1,2,3]'::vector + '[4,5,6]'; -SELECT '[3e38]'::vector + '[3e38]'; -SELECT '[1,2,3]'::vector - '[4,5,6]'; -SELECT '[-3e38]'::vector - '[3e38]'; - -SELECT vector_dims('[1,2,3]'); - -SELECT round(vector_norm('[1,1]')::numeric, 5); -SELECT vector_norm('[3,4]'); -SELECT vector_norm('[0,1]'); - -SELECT l2_distance('[0,0]', '[3,4]'); -SELECT l2_distance('[0,0]', '[0,1]'); -SELECT l2_distance('[1,2]', '[3]'); - -SELECT inner_product('[1,2]', '[3,4]'); -SELECT inner_product('[1,2]', '[3]'); - -SELECT cosine_distance('[1,2]', '[2,4]'); -SELECT cosine_distance('[1,2]', '[0,0]'); -SELECT cosine_distance('[1,1]', '[1,1]'); -SELECT cosine_distance('[1,1]', '[-1,-1]'); -SELECT 
cosine_distance('[1,2]', '[3]'); - -SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]']) v; -SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]', NULL]) v; -SELECT avg(v) FROM unnest(ARRAY[]::vector[]) v; -SELECT avg(v) FROM unnest(ARRAY['[1,2]'::vector, '[3]']) v; -SELECT vector_avg(array_agg(n)) FROM generate_series(1, 16002) n; diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/halfvec.sql b/src/postgres/third-party-extensions/pgvector/test/sql/halfvec.sql new file mode 100644 index 000000000000..1a3fd1b825a7 --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/test/sql/halfvec.sql @@ -0,0 +1,147 @@ +SELECT '[1,2,3]'::halfvec; +SELECT '[-1,-2,-3]'::halfvec; +SELECT '[1.,2.,3.]'::halfvec; +SELECT ' [ 1, 2 , 3 ] '::halfvec; +SELECT '[1.23456]'::halfvec; +SELECT '[hello,1]'::halfvec; +SELECT '[NaN,1]'::halfvec; +SELECT '[Infinity,1]'::halfvec; +SELECT '[-Infinity,1]'::halfvec; +SELECT '[65519,-65519]'::halfvec; +SELECT '[65520,-65520]'::halfvec; +SELECT '[1e-8,-1e-8]'::halfvec; +SELECT '[4e38,1]'::halfvec; +SELECT '[1e-46,1]'::halfvec; +SELECT '[1,2,3'::halfvec; +SELECT '[1,2,3]9'::halfvec; +SELECT '1,2,3'::halfvec; +SELECT ''::halfvec; +SELECT '['::halfvec; +SELECT '[ '::halfvec; +SELECT '[,'::halfvec; +SELECT '[]'::halfvec; +SELECT '[ ]'::halfvec; +SELECT '[,]'::halfvec; +SELECT '[1,]'::halfvec; +SELECT '[1a]'::halfvec; +SELECT '[1,,3]'::halfvec; +SELECT '[1, ,3]'::halfvec; + +SELECT '[1,2,3]'::halfvec(3); +SELECT '[1,2,3]'::halfvec(2); +SELECT '[1,2,3]'::halfvec(3, 2); +SELECT '[1,2,3]'::halfvec('a'); +SELECT '[1,2,3]'::halfvec(0); +SELECT '[1,2,3]'::halfvec(16001); + +SELECT unnest('{"[1,2,3]", "[4,5,6]"}'::halfvec[]); +SELECT '{"[1,2,3]"}'::halfvec(2)[]; + +SELECT '[1,2,3]'::halfvec + '[4,5,6]'; +SELECT '[65519]'::halfvec + '[65519]'; +SELECT '[1,2]'::halfvec + '[3]'; + +SELECT '[1,2,3]'::halfvec - '[4,5,6]'; +SELECT '[-65519]'::halfvec - '[65519]'; +SELECT '[1,2]'::halfvec - '[3]'; + +SELECT '[1,2,3]'::halfvec * 
'[4,5,6]'; +SELECT '[65519]'::halfvec * '[65519]'; +SELECT '[1e-7]'::halfvec * '[1e-7]'; +SELECT '[1,2]'::halfvec * '[3]'; + +SELECT '[1,2,3]'::halfvec || '[4,5]'; +SELECT array_fill(0, ARRAY[16000])::halfvec || '[1]'; + +SELECT '[1,2,3]'::halfvec < '[1,2,3]'; +SELECT '[1,2,3]'::halfvec < '[1,2]'; +SELECT '[1,2,3]'::halfvec <= '[1,2,3]'; +SELECT '[1,2,3]'::halfvec <= '[1,2]'; +SELECT '[1,2,3]'::halfvec = '[1,2,3]'; +SELECT '[1,2,3]'::halfvec = '[1,2]'; +SELECT '[1,2,3]'::halfvec != '[1,2,3]'; +SELECT '[1,2,3]'::halfvec != '[1,2]'; +SELECT '[1,2,3]'::halfvec >= '[1,2,3]'; +SELECT '[1,2,3]'::halfvec >= '[1,2]'; +SELECT '[1,2,3]'::halfvec > '[1,2,3]'; +SELECT '[1,2,3]'::halfvec > '[1,2]'; + +SELECT halfvec_cmp('[1,2,3]', '[1,2,3]'); +SELECT halfvec_cmp('[1,2,3]', '[0,0,0]'); +SELECT halfvec_cmp('[0,0,0]', '[1,2,3]'); +SELECT halfvec_cmp('[1,2]', '[1,2,3]'); +SELECT halfvec_cmp('[1,2,3]', '[1,2]'); +SELECT halfvec_cmp('[1,2]', '[2,3,4]'); +SELECT halfvec_cmp('[2,3]', '[1,2,3]'); + +SELECT vector_dims('[1,2,3]'::halfvec); + +SELECT round(l2_norm('[1,1]'::halfvec)::numeric, 5); +SELECT l2_norm('[3,4]'::halfvec); +SELECT l2_norm('[0,1]'::halfvec); +SELECT l2_norm('[0,0]'::halfvec); +SELECT l2_norm('[2]'::halfvec); + +SELECT l2_distance('[0,0]'::halfvec, '[3,4]'); +SELECT l2_distance('[0,0]'::halfvec, '[0,1]'); +SELECT l2_distance('[1,2]'::halfvec, '[3]'); +SELECT l2_distance('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,1,1,1,1,1,1,4,5]'); +SELECT '[0,0]'::halfvec <-> '[3,4]'; + +SELECT inner_product('[1,2]'::halfvec, '[3,4]'); +SELECT inner_product('[1,2]'::halfvec, '[3]'); +SELECT inner_product('[65504]'::halfvec, '[65504]'); +SELECT inner_product('[1,1,1,1,1,1,1,1,1]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); +SELECT '[1,2]'::halfvec <#> '[3,4]'; + +SELECT cosine_distance('[1,2]'::halfvec, '[2,4]'); +SELECT cosine_distance('[1,2]'::halfvec, '[0,0]'); +SELECT cosine_distance('[1,1]'::halfvec, '[1,1]'); +SELECT cosine_distance('[1,0]'::halfvec, '[0,2]'); +SELECT 
cosine_distance('[1,1]'::halfvec, '[-1,-1]'); +SELECT cosine_distance('[1,2]'::halfvec, '[3]'); +SELECT cosine_distance('[1,1]'::halfvec, '[1.1,1.1]'); +SELECT cosine_distance('[1,1]'::halfvec, '[-1.1,-1.1]'); +SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); +SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[-1,-2,-3,-4,-5,-6,-7,-8,-9]'); +SELECT '[1,2]'::halfvec <=> '[2,4]'; + +SELECT l1_distance('[0,0]'::halfvec, '[3,4]'); +SELECT l1_distance('[0,0]'::halfvec, '[0,1]'); +SELECT l1_distance('[1,2]'::halfvec, '[3]'); +SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[1,2,3,4,5,6,7,8,9]'); +SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::halfvec, '[0,3,2,5,4,7,6,9,8]'); +SELECT '[0,0]'::halfvec <+> '[3,4]'; + +SELECT l2_normalize('[3,4]'::halfvec); +SELECT l2_normalize('[3,0]'::halfvec); +SELECT l2_normalize('[0,0.1]'::halfvec); +SELECT l2_normalize('[0,0]'::halfvec); +SELECT l2_normalize('[65504]'::halfvec); + +SELECT binary_quantize('[1,0,-1]'::halfvec); +SELECT binary_quantize('[0,0.1,-0.2,-0.3,0.4,0.5,0.6,-0.7,0.8,-0.9,1]'::halfvec); + +SELECT subvector('[1,2,3,4,5]'::halfvec, 1, 3); +SELECT subvector('[1,2,3,4,5]'::halfvec, 3, 2); +SELECT subvector('[1,2,3,4,5]'::halfvec, -1, 3); +SELECT subvector('[1,2,3,4,5]'::halfvec, 3, 9); +SELECT subvector('[1,2,3,4,5]'::halfvec, 1, 0); +SELECT subvector('[1,2,3,4,5]'::halfvec, 3, -1); +SELECT subvector('[1,2,3,4,5]'::halfvec, -1, 2); +SELECT subvector('[1,2,3,4,5]'::halfvec, 2147483647, 10); +SELECT subvector('[1,2,3,4,5]'::halfvec, 3, 2147483647); +SELECT subvector('[1,2,3,4,5]'::halfvec, -2147483644, 2147483647); + +SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::halfvec, '[3,5,7]']) v; +SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::halfvec, '[3,5,7]', NULL]) v; +SELECT avg(v) FROM unnest(ARRAY[]::halfvec[]) v; +SELECT avg(v) FROM unnest(ARRAY['[1,2]'::halfvec, '[3]']) v; +SELECT avg(v) FROM unnest(ARRAY['[65504]'::halfvec, '[65504]']) v; +SELECT halfvec_avg(array_agg(n)) FROM 
generate_series(1, 16002) n; + +SELECT sum(v) FROM unnest(ARRAY['[1,2,3]'::halfvec, '[3,5,7]']) v; +SELECT sum(v) FROM unnest(ARRAY['[1,2,3]'::halfvec, '[3,5,7]', NULL]) v; +SELECT sum(v) FROM unnest(ARRAY[]::halfvec[]) v; +SELECT sum(v) FROM unnest(ARRAY['[1,2]'::halfvec, '[3]']) v; +SELECT sum(v) FROM unnest(ARRAY['[65504]'::halfvec, '[65504]']) v; diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/input.sql b/src/postgres/third-party-extensions/pgvector/test/sql/input.sql deleted file mode 100644 index a4ad08d8bb66..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/sql/input.sql +++ /dev/null @@ -1,27 +0,0 @@ -SELECT '[1,2,3]'::vector; -SELECT '[-1,-2,-3]'::vector; -SELECT '[1.,2.,3.]'::vector; -SELECT ' [ 1, 2 , 3 ] '::vector; -SELECT '[1.23456]'::vector; -SELECT '[hello,1]'::vector; -SELECT '[NaN,1]'::vector; -SELECT '[Infinity,1]'::vector; -SELECT '[-Infinity,1]'::vector; -SELECT '[1.5e38,-1.5e38]'::vector; -SELECT '[1.5e+38,-1.5e+38]'::vector; -SELECT '[1.5e-38,-1.5e-38]'::vector; -SELECT '[4e38,1]'::vector; -SELECT '[1,2,3'::vector; -SELECT '[1,2,3]9'::vector; -SELECT '1,2,3'::vector; -SELECT '['::vector; -SELECT '[,'::vector; -SELECT '[]'::vector; -SELECT '[1,]'::vector; -SELECT '[1a]'::vector; -SELECT '[1,,3]'::vector; -SELECT '[1, ,3]'::vector; -SELECT '[1,2,3]'::vector(2); - -SELECT unnest('{"[1,2,3]", "[4,5,6]"}'::vector[]); -SELECT '{"[1,2,3]"}'::vector(2)[]; diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_cosine.sql b/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_cosine.sql deleted file mode 100644 index 1fec6cfd9ee9..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_cosine.sql +++ /dev/null @@ -1,12 +0,0 @@ -SET enable_seqscan = off; - -CREATE TABLE t (val vector(3)); -INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); -CREATE INDEX ON t USING ivfflat (val vector_cosine_ops) WITH (lists = 1); - -INSERT INTO t (val) 
VALUES ('[1,2,4]'); - -SELECT * FROM t ORDER BY val <=> '[3,3,3]'; -SELECT * FROM t ORDER BY val <=> (SELECT NULL::vector); - -DROP TABLE t; diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_ip.sql b/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_ip.sql deleted file mode 100644 index 46daa4e5caef..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_ip.sql +++ /dev/null @@ -1,12 +0,0 @@ -SET enable_seqscan = off; - -CREATE TABLE t (val vector(3)); -INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); -CREATE INDEX ON t USING ivfflat (val vector_ip_ops) WITH (lists = 1); - -INSERT INTO t (val) VALUES ('[1,2,4]'); - -SELECT * FROM t ORDER BY val <#> '[3,3,3]'; -SELECT * FROM t ORDER BY val <#> (SELECT NULL::vector); - -DROP TABLE t; diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_l2.sql b/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_l2.sql deleted file mode 100644 index 9349572ab84f..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_l2.sql +++ /dev/null @@ -1,13 +0,0 @@ -SET enable_seqscan = off; - -CREATE TABLE t (val vector(3)); -INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); -CREATE INDEX ON t USING ivfflat (val) WITH (lists = 1); - -INSERT INTO t (val) VALUES ('[1,2,4]'); - -SELECT * FROM t ORDER BY val <-> '[3,3,3]'; -SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector); -SELECT COUNT(*) FROM t; - -DROP TABLE t; diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_options.sql b/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_options.sql deleted file mode 100644 index d8dc45c611cd..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_options.sql +++ /dev/null @@ -1,9 +0,0 @@ -SET enable_seqscan = off; - -CREATE TABLE t (val vector(3)); -CREATE INDEX ON t USING ivfflat (val) WITH (lists = 0); -CREATE INDEX ON t 
USING ivfflat (val) WITH (lists = 32769); - -SHOW ivfflat.probes; - -DROP TABLE t; diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_unlogged.sql b/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_unlogged.sql deleted file mode 100644 index ca4c6ba9ca67..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/sql/ivfflat_unlogged.sql +++ /dev/null @@ -1,9 +0,0 @@ -SET enable_seqscan = off; - -CREATE UNLOGGED TABLE t (val vector(3)); -INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); -CREATE INDEX ON t USING ivfflat (val) WITH (lists = 1); - -SELECT * FROM t ORDER BY val <-> '[3,3,3]'; - -DROP TABLE t; diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/sparsevec.sql b/src/postgres/third-party-extensions/pgvector/test/sql/sparsevec.sql new file mode 100644 index 000000000000..2e1136b8d5ad --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/test/sql/sparsevec.sql @@ -0,0 +1,134 @@ +SELECT '{1:1.5,3:3.5}/5'::sparsevec; +SELECT '{1:-2,3:-4}/5'::sparsevec; +SELECT '{1:2.,3:4.}/5'::sparsevec; +SELECT ' { 1 : 1.5 , 3 : 3.5 } / 5 '::sparsevec; +SELECT '{1:1.23456}/1'::sparsevec; +SELECT '{1:hello,2:1}/2'::sparsevec; +SELECT '{1:NaN,2:1}/2'::sparsevec; +SELECT '{1:Infinity,2:1}/2'::sparsevec; +SELECT '{1:-Infinity,2:1}/2'::sparsevec; +SELECT '{1:1.5e38,2:-1.5e38}/2'::sparsevec; +SELECT '{1:1.5e+38,2:-1.5e+38}/2'::sparsevec; +SELECT '{1:1.5e-38,2:-1.5e-38}/2'::sparsevec; +SELECT '{1:4e38,2:1}/2'::sparsevec; +SELECT '{1:-4e38,2:1}/2'::sparsevec; +SELECT '{1:1e-46,2:1}/2'::sparsevec; +SELECT '{1:-1e-46,2:1}/2'::sparsevec; +SELECT ''::sparsevec; +SELECT '{'::sparsevec; +SELECT '{ '::sparsevec; +SELECT '{:'::sparsevec; +SELECT '{,'::sparsevec; +SELECT '{}'::sparsevec; +SELECT '{}/'::sparsevec; +SELECT '{}/1'::sparsevec; +SELECT '{}/1a'::sparsevec; +SELECT '{ }/1'::sparsevec; +SELECT '{:}/1'::sparsevec; +SELECT '{,}/1'::sparsevec; +SELECT '{1,}/1'::sparsevec; +SELECT 
'{:1}/1'::sparsevec; +SELECT '{1:}/1'::sparsevec; +SELECT '{1a:1}/1'::sparsevec; +SELECT '{1:1a}/1'::sparsevec; +SELECT '{1:1,}/1'::sparsevec; +SELECT '{1:0,2:1,3:0}/3'::sparsevec; +SELECT '{2:1,1:1}/2'::sparsevec; +SELECT '{1:1,1:1}/2'::sparsevec; +SELECT '{1:1,2:1,1:1}/2'::sparsevec; +SELECT '{}/5'::sparsevec; +SELECT '{}/-1'::sparsevec; +SELECT '{}/1000000001'::sparsevec; +SELECT '{}/2147483648'::sparsevec; +SELECT '{}/-2147483649'::sparsevec; +SELECT '{}/9223372036854775808'::sparsevec; +SELECT '{}/-9223372036854775809'::sparsevec; +SELECT '{2147483647:1}/1'::sparsevec; +SELECT '{2147483648:1}/1'::sparsevec; +SELECT '{-2147483648:1}/1'::sparsevec; +SELECT '{-2147483649:1}/1'::sparsevec; +SELECT '{0:1}/1'::sparsevec; +SELECT '{2:1}/1'::sparsevec; + +SELECT '{}/3'::sparsevec(3); +SELECT '{}/3'::sparsevec(2); +SELECT '{}/3'::sparsevec(3, 2); +SELECT '{}/3'::sparsevec('a'); +SELECT '{}/3'::sparsevec(0); +SELECT '{}/3'::sparsevec(1000000001); + +SELECT '{1:1,2:2,3:3}/3'::sparsevec < '{1:1,2:2,3:3}/3'; +SELECT '{1:1,2:2,3:3}/3'::sparsevec < '{1:1,2:2}/2'; +SELECT '{1:1,2:2,3:3}/3'::sparsevec <= '{1:1,2:2,3:3}/3'; +SELECT '{1:1,2:2,3:3}/3'::sparsevec <= '{1:1,2:2}/2'; +SELECT '{1:1,2:2,3:3}/3'::sparsevec = '{1:1,2:2,3:3}/3'; +SELECT '{1:1,2:2,3:3}/3'::sparsevec = '{1:1,2:2}/2'; +SELECT '{1:1,2:2,3:3}/3'::sparsevec != '{1:1,2:2,3:3}/3'; +SELECT '{1:1,2:2,3:3}/3'::sparsevec != '{1:1,2:2}/2'; +SELECT '{1:1,2:2,3:3}/3'::sparsevec >= '{1:1,2:2,3:3}/3'; +SELECT '{1:1,2:2,3:3}/3'::sparsevec >= '{1:1,2:2}/2'; +SELECT '{1:1,2:2,3:3}/3'::sparsevec > '{1:1,2:2,3:3}/3'; +SELECT '{1:1,2:2,3:3}/3'::sparsevec > '{1:1,2:2}/2'; + +SELECT sparsevec_cmp('{1:1,2:2,3:3}/3', '{1:1,2:2,3:3}/3'); +SELECT sparsevec_cmp('{1:1,2:2,3:3}/3', '{}/3'); +SELECT sparsevec_cmp('{}/3', '{1:1,2:2,3:3}/3'); +SELECT sparsevec_cmp('{1:1,2:2}/2', '{1:1,2:2,3:3}/3'); +SELECT sparsevec_cmp('{1:1,2:2,3:3}/3', '{1:1,2:2}/2'); +SELECT sparsevec_cmp('{1:1,2:2}/2', '{1:2,2:3,3:4}/3'); +SELECT 
sparsevec_cmp('{1:2,2:3}/2', '{1:1,2:2,3:3}/3'); + +SELECT round(l2_norm('{1:1,2:1}/2'::sparsevec)::numeric, 5); +SELECT l2_norm('{1:3,2:4}/2'::sparsevec); +SELECT l2_norm('{2:1}/2'::sparsevec); +SELECT l2_norm('{1:3e37,2:4e37}/2'::sparsevec)::real; +SELECT l2_norm('{}/2'::sparsevec); +SELECT l2_norm('{1:2}/1'::sparsevec); + +SELECT l2_distance('{}/2'::sparsevec, '{1:3,2:4}/2'); +SELECT l2_distance('{1:3}/2'::sparsevec, '{2:4}/2'); +SELECT l2_distance('{2:4}/2'::sparsevec, '{1:3}/2'); +SELECT l2_distance('{1:3,2:4}/2'::sparsevec, '{}/2'); +SELECT l2_distance('{}/2'::sparsevec, '{2:1}/2'); +SELECT '{}/2'::sparsevec <-> '{1:3,2:4}/2'; + +SELECT inner_product('{1:1,2:2}/2'::sparsevec, '{1:2,2:4}/2'); +SELECT inner_product('{1:1,2:2}/2'::sparsevec, '{1:3}/1'); +SELECT inner_product('{1:1,3:3}/4'::sparsevec, '{2:2,4:4}/4'); +SELECT inner_product('{2:2,4:4}/4'::sparsevec, '{1:1,3:3}/4'); +SELECT inner_product('{1:1,3:3,5:5}/5'::sparsevec, '{2:4,3:6,4:8}/5'); +SELECT inner_product('{1:1}/2'::sparsevec, '{}/2'); +SELECT inner_product('{}/2'::sparsevec, '{1:1}/2'); +SELECT inner_product('{1:3e38}/1'::sparsevec, '{1:3e38}/1'); +SELECT inner_product('{1:1,3:3,5:5}/5'::sparsevec, '{2:4,3:6,4:8}/5'); +SELECT '{1:1,2:2}/2'::sparsevec <#> '{1:3,2:4}/2'; + +SELECT cosine_distance('{1:1,2:2}/2'::sparsevec, '{1:2,2:4}/2'); +SELECT cosine_distance('{1:1,2:2}/2'::sparsevec, '{}/2'); +SELECT cosine_distance('{1:1,2:1}/2'::sparsevec, '{1:1,2:1}/2'); +SELECT cosine_distance('{1:1}/2'::sparsevec, '{2:2}/2'); +SELECT cosine_distance('{1:1,2:1}/2'::sparsevec, '{1:-1,2:-1}/2'); +SELECT cosine_distance('{1:2}/2'::sparsevec, '{2:2}/2'); +SELECT cosine_distance('{2:2}/2'::sparsevec, '{1:2}/2'); +SELECT cosine_distance('{1:1,2:2}/2'::sparsevec, '{1:3}/1'); +SELECT cosine_distance('{1:1,2:1}/2'::sparsevec, '{1:1.1,2:1.1}/2'); +SELECT cosine_distance('{1:1,2:1}/2'::sparsevec, '{1:-1.1,2:-1.1}/2'); +SELECT cosine_distance('{1:3e38}/1'::sparsevec, '{1:3e38}/1'); +SELECT 
cosine_distance('{}/1'::sparsevec, '{}/1'); +SELECT '{1:1,2:2}/2'::sparsevec <=> '{1:2,2:4}/2'; + +SELECT l1_distance('{}/2'::sparsevec, '{1:3,2:4}/2'); +SELECT l1_distance('{}/2'::sparsevec, '{2:1}/2'); +SELECT l1_distance('{1:1,2:2}/2'::sparsevec, '{1:3}/1'); +SELECT l1_distance('{1:3e38}/1'::sparsevec, '{1:-3e38}/1'); +SELECT l1_distance('{1:1,3:3,5:5,7:7}/8'::sparsevec, '{2:2,4:4,6:6,8:8}/8'); +SELECT l1_distance('{1:1,3:3,5:5,7:7,9:9}/9'::sparsevec, '{2:2,4:4,6:6,8:8}/9'); +SELECT '{}/2'::sparsevec <+> '{1:3,2:4}/2'; + +SELECT l2_normalize('{1:3,2:4}/2'::sparsevec); +SELECT l2_normalize('{1:3}/2'::sparsevec); +SELECT l2_normalize('{2:0.1}/2'::sparsevec); +SELECT l2_normalize('{}/2'::sparsevec); +SELECT l2_normalize('{1:3e38}/1'::sparsevec); +SELECT l2_normalize('{1:3e38,2:1e-37}/2'::sparsevec); +SELECT l2_normalize('{2:3e37,4:3e-37,6:4e37,8:4e-37}/9'::sparsevec); diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/vector_type.sql b/src/postgres/third-party-extensions/pgvector/test/sql/vector_type.sql new file mode 100644 index 000000000000..088b040aaa88 --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/test/sql/vector_type.sql @@ -0,0 +1,154 @@ +SELECT '[1,2,3]'::vector; +SELECT '[-1,-2,-3]'::vector; +SELECT '[1.,2.,3.]'::vector; +SELECT ' [ 1, 2 , 3 ] '::vector; +SELECT '[1.23456]'::vector; +SELECT '[hello,1]'::vector; +SELECT '[NaN,1]'::vector; +SELECT '[Infinity,1]'::vector; +SELECT '[-Infinity,1]'::vector; +SELECT '[1.5e38,-1.5e38]'::vector; +SELECT '[1.5e+38,-1.5e+38]'::vector; +SELECT '[1.5e-38,-1.5e-38]'::vector; +SELECT '[4e38,1]'::vector; +SELECT '[-4e38,1]'::vector; +SELECT '[1e-46,1]'::vector; +SELECT '[-1e-46,1]'::vector; +SELECT '[1,2,3'::vector; +SELECT '[1,2,3]9'::vector; +SELECT '1,2,3'::vector; +SELECT ''::vector; +SELECT '['::vector; +SELECT '[ '::vector; +SELECT '[,'::vector; +SELECT '[]'::vector; +SELECT '[ ]'::vector; +SELECT '[,]'::vector; +SELECT '[1,]'::vector; +SELECT '[1a]'::vector; +SELECT 
'[1,,3]'::vector; +SELECT '[1, ,3]'::vector; + +SELECT '[1,2,3]'::vector(3); +SELECT '[1,2,3]'::vector(2); +SELECT '[1,2,3]'::vector(3, 2); +SELECT '[1,2,3]'::vector('a'); +SELECT '[1,2,3]'::vector(0); +SELECT '[1,2,3]'::vector(16001); + +SELECT unnest('{"[1,2,3]", "[4,5,6]"}'::vector[]); +SELECT '{"[1,2,3]"}'::vector(2)[]; + + +SELECT '[1,2,3]'::vector + '[4,5,6]'; +SELECT '[3e38]'::vector + '[3e38]'; +SELECT '[1,2]'::vector + '[3]'; + +SELECT '[1,2,3]'::vector - '[4,5,6]'; +SELECT '[-3e38]'::vector - '[3e38]'; +SELECT '[1,2]'::vector - '[3]'; + +SELECT '[1,2,3]'::vector * '[4,5,6]'; +SELECT '[1e37]'::vector * '[1e37]'; +SELECT '[1e-37]'::vector * '[1e-37]'; +SELECT '[1,2]'::vector * '[3]'; + +SELECT '[1,2,3]'::vector || '[4,5]'; +SELECT array_fill(0, ARRAY[16000])::vector || '[1]'; + +SELECT '[1,2,3]'::vector < '[1,2,3]'; +SELECT '[1,2,3]'::vector < '[1,2]'; +SELECT '[1,2,3]'::vector <= '[1,2,3]'; +SELECT '[1,2,3]'::vector <= '[1,2]'; +SELECT '[1,2,3]'::vector = '[1,2,3]'; +SELECT '[1,2,3]'::vector = '[1,2]'; +SELECT '[1,2,3]'::vector != '[1,2,3]'; +SELECT '[1,2,3]'::vector != '[1,2]'; +SELECT '[1,2,3]'::vector >= '[1,2,3]'; +SELECT '[1,2,3]'::vector >= '[1,2]'; +SELECT '[1,2,3]'::vector > '[1,2,3]'; +SELECT '[1,2,3]'::vector > '[1,2]'; + +SELECT vector_cmp('[1,2,3]', '[1,2,3]'); +SELECT vector_cmp('[1,2,3]', '[0,0,0]'); +SELECT vector_cmp('[0,0,0]', '[1,2,3]'); +SELECT vector_cmp('[1,2]', '[1,2,3]'); +SELECT vector_cmp('[1,2,3]', '[1,2]'); +SELECT vector_cmp('[1,2]', '[2,3,4]'); +SELECT vector_cmp('[2,3]', '[1,2,3]'); + +SELECT vector_dims('[1,2,3]'::vector); + +SELECT round(vector_norm('[1,1]')::numeric, 5); +SELECT vector_norm('[3,4]'); +SELECT vector_norm('[0,1]'); +SELECT vector_norm('[3e37,4e37]')::real; +SELECT vector_norm('[0,0]'); +SELECT vector_norm('[2]'); + +SELECT l2_distance('[0,0]'::vector, '[3,4]'); +SELECT l2_distance('[0,0]'::vector, '[0,1]'); +SELECT l2_distance('[1,2]'::vector, '[3]'); +SELECT l2_distance('[3e38]'::vector, '[-3e38]'); +SELECT 
l2_distance('[1,1,1,1,1,1,1,1,1]'::vector, '[1,1,1,1,1,1,1,4,5]'); +SELECT '[0,0]'::vector <-> '[3,4]'; + +SELECT inner_product('[1,2]'::vector, '[3,4]'); +SELECT inner_product('[1,2]'::vector, '[3]'); +SELECT inner_product('[3e38]'::vector, '[3e38]'); +SELECT inner_product('[1,1,1,1,1,1,1,1,1]'::vector, '[1,2,3,4,5,6,7,8,9]'); +SELECT '[1,2]'::vector <#> '[3,4]'; + +SELECT cosine_distance('[1,2]'::vector, '[2,4]'); +SELECT cosine_distance('[1,2]'::vector, '[0,0]'); +SELECT cosine_distance('[1,1]'::vector, '[1,1]'); +SELECT cosine_distance('[1,0]'::vector, '[0,2]'); +SELECT cosine_distance('[1,1]'::vector, '[-1,-1]'); +SELECT cosine_distance('[1,2]'::vector, '[3]'); +SELECT cosine_distance('[1,1]'::vector, '[1.1,1.1]'); +SELECT cosine_distance('[1,1]'::vector, '[-1.1,-1.1]'); +SELECT cosine_distance('[3e38]'::vector, '[3e38]'); +SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::vector, '[1,2,3,4,5,6,7,8,9]'); +SELECT cosine_distance('[1,2,3,4,5,6,7,8,9]'::vector, '[-1,-2,-3,-4,-5,-6,-7,-8,-9]'); +SELECT '[1,2]'::vector <=> '[2,4]'; + +SELECT l1_distance('[0,0]'::vector, '[3,4]'); +SELECT l1_distance('[0,0]'::vector, '[0,1]'); +SELECT l1_distance('[1,2]'::vector, '[3]'); +SELECT l1_distance('[3e38]'::vector, '[-3e38]'); +SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::vector, '[1,2,3,4,5,6,7,8,9]'); +SELECT l1_distance('[1,2,3,4,5,6,7,8,9]'::vector, '[0,3,2,5,4,7,6,9,8]'); +SELECT '[0,0]'::vector <+> '[3,4]'; + +SELECT l2_normalize('[3,4]'::vector); +SELECT l2_normalize('[3,0]'::vector); +SELECT l2_normalize('[0,0.1]'::vector); +SELECT l2_normalize('[0,0]'::vector); +SELECT l2_normalize('[3e38]'::vector); + +SELECT binary_quantize('[1,0,-1]'::vector); +SELECT binary_quantize('[0,0.1,-0.2,-0.3,0.4,0.5,0.6,-0.7,0.8,-0.9,1]'::vector); + +SELECT subvector('[1,2,3,4,5]'::vector, 1, 3); +SELECT subvector('[1,2,3,4,5]'::vector, 3, 2); +SELECT subvector('[1,2,3,4,5]'::vector, -1, 3); +SELECT subvector('[1,2,3,4,5]'::vector, 3, 9); +SELECT subvector('[1,2,3,4,5]'::vector, 1, 0); 
+SELECT subvector('[1,2,3,4,5]'::vector, 3, -1); +SELECT subvector('[1,2,3,4,5]'::vector, -1, 2); +SELECT subvector('[1,2,3,4,5]'::vector, 2147483647, 10); +SELECT subvector('[1,2,3,4,5]'::vector, 3, 2147483647); +SELECT subvector('[1,2,3,4,5]'::vector, -2147483644, 2147483647); + +SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]']) v; +SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]', NULL]) v; +SELECT avg(v) FROM unnest(ARRAY[]::vector[]) v; +SELECT avg(v) FROM unnest(ARRAY['[1,2]'::vector, '[3]']) v; +SELECT avg(v) FROM unnest(ARRAY['[3e38]'::vector, '[3e38]']) v; +SELECT vector_avg(array_agg(n)) FROM generate_series(1, 16002) n; + +SELECT sum(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]']) v; +SELECT sum(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]', NULL]) v; +SELECT sum(v) FROM unnest(ARRAY[]::vector[]) v; +SELECT sum(v) FROM unnest(ARRAY['[1,2]'::vector, '[3]']) v; +SELECT sum(v) FROM unnest(ARRAY['[3e38]'::vector, '[3e38]']) v; diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/yb.orig.setup.sql b/src/postgres/third-party-extensions/pgvector/test/sql/yb.orig.setup.sql index 05a495838412..c456f5eb72ea 100644 --- a/src/postgres/third-party-extensions/pgvector/test/sql/yb.orig.setup.sql +++ b/src/postgres/third-party-extensions/pgvector/test/sql/yb.orig.setup.sql @@ -1 +1 @@ -CREATE EXTENSION vector VERSION '0.4.4-yb-1.2'; +CREATE EXTENSION vector VERSION '0.8.0-yb-1.0'; diff --git a/src/postgres/third-party-extensions/pgvector/test/sql/yb.port.copy.sql b/src/postgres/third-party-extensions/pgvector/test/sql/yb.port.copy.sql new file mode 100644 index 000000000000..a09745f1b514 --- /dev/null +++ b/src/postgres/third-party-extensions/pgvector/test/sql/yb.port.copy.sql @@ -0,0 +1,44 @@ +-- vector + +CREATE TABLE t (val vector(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); + +CREATE TABLE t2 (val vector(3)); + +\copy t TO '/tmp/vector.bin' WITH (FORMAT binary) +\copy t2 FROM 
'/tmp/vector.bin' WITH (FORMAT binary) + +SELECT * FROM t2 ORDER BY val; + +DROP TABLE t; +DROP TABLE t2; + +-- halfvec + +CREATE TABLE t (val halfvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); + +CREATE TABLE t2 (val halfvec(3)); + +\copy t TO '/tmp/halfvec.bin' WITH (FORMAT binary) +\copy t2 FROM '/tmp/halfvec.bin' WITH (FORMAT binary) + +SELECT * FROM t2 ORDER BY val; + +DROP TABLE t; +DROP TABLE t2; + +-- sparsevec + +CREATE TABLE t (val sparsevec(3)); +INSERT INTO t (val) VALUES ('{}/3'), ('{1:1,2:2,3:3}/3'), ('{1:1,2:1,3:1}/3'), (NULL); + +CREATE TABLE t2 (val sparsevec(3)); + +\copy t TO '/tmp/sparsevec.bin' WITH (FORMAT binary) +\copy t2 FROM '/tmp/sparsevec.bin' WITH (FORMAT binary) + +SELECT * FROM t2 ORDER BY val; + +DROP TABLE t; +DROP TABLE t2; diff --git a/src/postgres/third-party-extensions/pgvector/test/t/001_wal.pl b/src/postgres/third-party-extensions/pgvector/test/t/001_wal.pl deleted file mode 100644 index 46060ede6523..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/t/001_wal.pl +++ /dev/null @@ -1,97 +0,0 @@ -# Based on postgres/contrib/bloom/t/001_wal.pl - -# Test generic xlog record work for ivfflat index replication. -use strict; -use warnings; -use PostgresNode; -use TestLib; -use Test::More tests => 31; - -my $dim = 32; - -my $node_primary; -my $node_replica; - -# Run few queries on both primary and replica and check their results match. -sub test_index_replay -{ - my ($test_name) = @_; - - # Wait for replica to catch up - my $applname = $node_replica->name; - - my $server_version_num = $node_primary->safe_psql("postgres", "SHOW server_version_num"); - my $caughtup_query = "SELECT pg_current_wal_lsn() <= replay_lsn FROM pg_stat_replication WHERE application_name = '$applname';"; - $node_primary->poll_query_until('postgres', $caughtup_query) - or die "Timed out while waiting for replica 1 to catch up"; - - my @r = (); - for (1 .. 
$dim) { - push(@r, rand()); - } - my $sql = join(",", @r); - - my $queries = qq( - SET enable_seqscan = off; - SELECT * FROM tst ORDER BY v <-> '[$sql]' LIMIT 10; - ); - - # Run test queries and compare their result - my $primary_result = $node_primary->safe_psql("postgres", $queries); - my $replica_result = $node_replica->safe_psql("postgres", $queries); - - is($primary_result, $replica_result, "$test_name: query result matches"); - return; -} - -# Use ARRAY[random(), random(), random(), ...] over -# SELECT array_agg(random()) FROM generate_series(1, $dim) -# to generate different values for each row -my $array_sql = join(",", ('random()') x $dim); - -# Initialize primary node -$node_primary = get_new_node('primary'); -$node_primary->init(allows_streaming => 1); -if ($dim > 32) { - # TODO use wal_keep_segments for Postgres < 13 - $node_primary->append_conf('postgresql.conf', qq(wal_keep_size = 1GB)); -} -if ($dim > 1500) { - $node_primary->append_conf('postgresql.conf', qq(maintenance_work_mem = 128MB)); -} -$node_primary->start; -my $backup_name = 'my_backup'; - -# Take backup -$node_primary->backup($backup_name); - -# Create streaming replica linking to primary -$node_replica = get_new_node('replica'); -$node_replica->init_from_backup($node_primary, $backup_name, - has_streaming => 1); -$node_replica->start; - -# Create ivfflat index on primary -$node_primary->safe_psql("postgres", "CREATE EXTENSION vector;"); -$node_primary->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim));"); -$node_primary->safe_psql("postgres", - "INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series(1, 100000) i;" -); -$node_primary->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v);"); - -# Test that queries give same result -test_index_replay('initial'); - -# Run 10 cycles of table modification. Run test queries after each modification. -for my $i (1 .. 
10) -{ - $node_primary->safe_psql("postgres", "DELETE FROM tst WHERE i = $i;"); - test_index_replay("delete $i"); - $node_primary->safe_psql("postgres", "VACUUM tst;"); - test_index_replay("vacuum $i"); - my ($start, $end) = (100001 + ($i - 1) * 10000, 100000 + $i * 10000); - $node_primary->safe_psql("postgres", - "INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series($start, $end) i;" - ); - test_index_replay("insert $i"); -} diff --git a/src/postgres/third-party-extensions/pgvector/test/t/002_vacuum.pl b/src/postgres/third-party-extensions/pgvector/test/t/002_vacuum.pl deleted file mode 100644 index 1c3d718469bf..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/t/002_vacuum.pl +++ /dev/null @@ -1,41 +0,0 @@ -use strict; -use warnings; -use PostgresNode; -use TestLib; -use Test::More tests => 1; - -my $dim = 3; - -my @r = (); -for (1 .. $dim) { - my $v = int(rand(1000)) + 1; - push(@r, "i % $v"); -} -my $array_sql = join(", ", @r); - -# Initialize node -my $node = get_new_node('node'); -$node->init; -$node->start; - -# Create table and index -$node->safe_psql("postgres", "CREATE EXTENSION vector;"); -$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim));"); -$node->safe_psql("postgres", - "INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series(1, 100000) i;" -); -$node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v);"); - -# Get size -my $size = $node->safe_psql("postgres", "SELECT pg_total_relation_size('tst_v_idx');"); - -# Delete all, vacuum, and insert same data -$node->safe_psql("postgres", "DELETE FROM tst;"); -$node->safe_psql("postgres", "VACUUM tst;"); -$node->safe_psql("postgres", - "INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series(1, 100000) i;" -); - -# Check size -my $new_size = $node->safe_psql("postgres", "SELECT pg_total_relation_size('tst_v_idx');"); -is($size, $new_size, "size does not change"); diff --git 
a/src/postgres/third-party-extensions/pgvector/test/t/003_recall.pl b/src/postgres/third-party-extensions/pgvector/test/t/003_recall.pl deleted file mode 100644 index dddc4d5e0764..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/t/003_recall.pl +++ /dev/null @@ -1,88 +0,0 @@ -use strict; -use warnings; -use PostgresNode; -use TestLib; -use Test::More tests => 9; - -my $node; -my @queries = (); -my @expected; -my $limit = 20; - -sub test_recall -{ - my ($probes, $min, $operator) = @_; - my $correct = 0; - my $total = 0; - - for my $i (0 .. $#queries) { - my $actual = $node->safe_psql("postgres", qq( - SET enable_seqscan = off; - SET ivfflat.probes = $probes; - SELECT i FROM tst ORDER BY v $operator '$queries[$i]' LIMIT $limit; - )); - my @actual_ids = split("\n", $actual); - my %actual_set = map { $_ => 1 } @actual_ids; - - my @expected_ids = split("\n", $expected[$i]); - - foreach (@expected_ids) { - if (exists($actual_set{$_})) { - $correct++; - } - $total++; - } - } - - cmp_ok($correct / $total, ">=", $min, $operator); -} - -# Initialize node -$node = get_new_node('node'); -$node->init; -$node->start; - -# Create table -$node->safe_psql("postgres", "CREATE EXTENSION vector;"); -$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector(3));"); -$node->safe_psql("postgres", - "INSERT INTO tst SELECT i, ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;" -); - -# Generate queries -for (1..20) { - my $r1 = rand(); - my $r2 = rand(); - my $r3 = rand(); - push(@queries, "[$r1,$r2,$r3]"); -} - -# Check each index type -my @operators = ("<->", "<#>", "<=>"); - -foreach (@operators) { - my $operator = $_; - - # Get exact results - @expected = (); - foreach (@queries) { - my $res = $node->safe_psql("postgres", "SELECT i FROM tst ORDER BY v $operator '$_' LIMIT $limit;"); - push(@expected, $res); - } - - # Add index - my $opclass; - if ($operator == "<->") { - $opclass = "vector_l2_ops"; - } elsif ($operator == "<#>") { - 
$opclass = "vector_ip_ops"; - } else { - $opclass = "vector_cosine_ops"; - } - $node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v $opclass);"); - - # Test approximate results - test_recall(1, 0.75, $operator); - test_recall(10, 0.95, $operator); - test_recall(100, 1.0, $operator); -} diff --git a/src/postgres/third-party-extensions/pgvector/test/t/004_centers.pl b/src/postgres/third-party-extensions/pgvector/test/t/004_centers.pl deleted file mode 100644 index 9c2b53a5a716..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/t/004_centers.pl +++ /dev/null @@ -1,36 +0,0 @@ -use strict; -use warnings; -use PostgresNode; -use TestLib; -use Test::More tests => 3; - -# Initialize node -my $node = get_new_node('node'); -$node->init; -$node->start; - -# Create table -$node->safe_psql("postgres", "CREATE EXTENSION vector;"); -$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector(3));"); -$node->safe_psql("postgres", - "INSERT INTO tst SELECT i, '[1,2,3]' FROM generate_series(1, 10) i;" -); - -sub test_centers -{ - my ($lists, $min) = @_; - - my ($ret, $stdout, $stderr) = $node->psql("postgres", "CREATE INDEX ON tst USING ivfflat (v) WITH (lists = $lists);"); - is($ret, 0, $stderr); -} - -# Test no error for duplicate centers -test_centers(5); -test_centers(10); - -$node->safe_psql("postgres", - "INSERT INTO tst SELECT i, '[4,5,6]' FROM generate_series(1, 10) i;" -); - -# Test no error for duplicate centers -test_centers(10); diff --git a/src/postgres/third-party-extensions/pgvector/test/t/005_query_recall.pl b/src/postgres/third-party-extensions/pgvector/test/t/005_query_recall.pl deleted file mode 100644 index 0e58135a383a..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/t/005_query_recall.pl +++ /dev/null @@ -1,45 +0,0 @@ -use strict; -use warnings; -use PostgresNode; -use TestLib; -use Test::More tests => 60; - -# Initialize node -my $node = get_new_node('node'); -$node->init; -$node->start; - -# Create 
table -$node->safe_psql("postgres", "CREATE EXTENSION vector;"); -$node->safe_psql("postgres", "CREATE TABLE tst (i int4 primary key, v vector(3));"); -$node->safe_psql("postgres", - "INSERT INTO tst SELECT i, ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;" -); - -# Check each index type -my @operators = ("<->", "<#>", "<=>"); -foreach (@operators) { - my $operator = $_; - - # Add index - my $opclass; - if ($operator == "<->") { - $opclass = "vector_l2_ops"; - } elsif ($operator == "<#>") { - $opclass = "vector_ip_ops"; - } else { - $opclass = "vector_cosine_ops"; - } - $node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v $opclass);"); - - # Test 100% recall - for (1..20) { - my $i = int(rand() * 100000); - my $query = $node->safe_psql("postgres", "SELECT v FROM tst WHERE i = $i;"); - my $res = $node->safe_psql("postgres", qq( - SET enable_seqscan = off; - SELECT v FROM tst ORDER BY v <-> '$query' LIMIT 1; - )); - is($res, $query); - } -} diff --git a/src/postgres/third-party-extensions/pgvector/test/t/006_lists.pl b/src/postgres/third-party-extensions/pgvector/test/t/006_lists.pl deleted file mode 100644 index eeb11aa3ca71..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/t/006_lists.pl +++ /dev/null @@ -1,31 +0,0 @@ -use strict; -use warnings; -use PostgresNode; -use TestLib; -use Test::More tests => 3; - -# Initialize node -my $node = get_new_node('node'); -$node->init; -$node->start; - -# Create table -$node->safe_psql("postgres", "CREATE EXTENSION vector;"); -$node->safe_psql("postgres", "CREATE TABLE tst (v vector(3));"); -$node->safe_psql("postgres", - "INSERT INTO tst SELECT ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;" -); - -$node->safe_psql("postgres", "CREATE INDEX lists50 ON tst USING ivfflat (v) WITH (lists = 50);"); -$node->safe_psql("postgres", "CREATE INDEX lists100 ON tst USING ivfflat (v) WITH (lists = 100);"); - -# Test prefers more lists -my $res = 
$node->safe_psql("postgres", "EXPLAIN SELECT v FROM tst ORDER BY v <-> '[0.5,0.5,0.5]' LIMIT 10;"); -like($res, qr/lists100/); -unlike($res, qr/lists50/); - -# Test errors with too much memory -my ($ret, $stdout, $stderr) = $node->psql("postgres", - "CREATE INDEX lists10000 ON tst USING ivfflat (v) WITH (lists = 10000);" -); -like($stderr, qr/memory required is/); diff --git a/src/postgres/third-party-extensions/pgvector/test/t/007_inserts.pl b/src/postgres/third-party-extensions/pgvector/test/t/007_inserts.pl deleted file mode 100644 index 73d77b58e9b6..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/t/007_inserts.pl +++ /dev/null @@ -1,55 +0,0 @@ -use strict; -use warnings; -use PostgresNode; -use TestLib; -use Test::More tests => 7; - -my $dim = 768; - -my $array_sql = join(",", ('random()') x $dim); - -# Initialize node -my $node = get_new_node('node'); -$node->init; -$node->start; - -# Create table and index -$node->safe_psql("postgres", "CREATE EXTENSION vector;"); -$node->safe_psql("postgres", "CREATE TABLE tst (v vector($dim));"); -$node->safe_psql("postgres", - "INSERT INTO tst SELECT ARRAY[$array_sql] FROM generate_series(1, 10000) i;" -); -$node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v);"); - -$node->pgbench( - "--no-vacuum --client=5 --transactions=100", - 0, - [qr{actually processed}], - [qr{^$}], - "concurrent INSERTs", - { - "007_inserts" => "INSERT INTO tst SELECT ARRAY[$array_sql] FROM generate_series(1, 10) i;" - } -); - -sub idx_scan -{ - # Stats do not update instantaneously - # https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-STATS-VIEWS - sleep(1); - $node->safe_psql("postgres", "SELECT idx_scan FROM pg_stat_user_indexes WHERE indexrelid = 'tst_v_idx'::regclass;"); -} - -my $expected = 10000 + 5 * 100 * 10; - -my $count = $node->safe_psql("postgres", "SELECT COUNT(*) FROM tst;"); -is($count, $expected); -is(idx_scan(), 0); - -$count = $node->safe_psql("postgres", qq( - SET 
enable_seqscan = off; - SET ivfflat.probes = 100; - SELECT COUNT(*) FROM (SELECT v FROM tst ORDER BY v <-> (SELECT v FROM tst LIMIT 1)) t; -)); -is($count, $expected); -is(idx_scan(), 1); diff --git a/src/postgres/third-party-extensions/pgvector/test/t/008_avg.pl b/src/postgres/third-party-extensions/pgvector/test/t/008_avg.pl deleted file mode 100644 index f03df617df28..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/t/008_avg.pl +++ /dev/null @@ -1,35 +0,0 @@ -use strict; -use warnings; -use PostgresNode; -use TestLib; -use Test::More tests => 5; - -# Initialize node -my $node = get_new_node('node'); -$node->init; -$node->start; - -# Create table -$node->safe_psql("postgres", "CREATE EXTENSION vector;"); -$node->safe_psql("postgres", "CREATE TABLE tst (r1 real, r2 real, r3 real, v vector(3));"); -$node->safe_psql("postgres", qq( - INSERT INTO tst SELECT r1, r2, r3, ARRAY[r1, r2, r3] FROM ( - SELECT random() + 1.01 AS r1, random() + 2.01 AS r2, random() + 3.01 AS r3 FROM generate_series(1, 1000000) t - ) i; -)); - -# Test avg -my $avg = $node->safe_psql("postgres", "SELECT AVG(v) FROM tst;"); -like($avg, qr/\[1\.5/); -like($avg, qr/,2\.5/); -like($avg, qr/,3\.5/); - -# Test matches real -my $r1 = $node->safe_psql("postgres", "SELECT AVG(r1)::float4 FROM tst;"); -my $r2 = $node->safe_psql("postgres", "SELECT AVG(r2)::float4 FROM tst;"); -my $r3 = $node->safe_psql("postgres", "SELECT AVG(r3)::float4 FROM tst;"); -is($avg, "[$r1,$r2,$r3]"); - -# Test explain -my $explain = $node->safe_psql("postgres", "EXPLAIN SELECT AVG(v) FROM tst;"); -like($explain, qr/Partial Aggregate/); diff --git a/src/postgres/third-party-extensions/pgvector/test/t/009_storage.pl b/src/postgres/third-party-extensions/pgvector/test/t/009_storage.pl deleted file mode 100644 index de818c78b46d..000000000000 --- a/src/postgres/third-party-extensions/pgvector/test/t/009_storage.pl +++ /dev/null @@ -1,32 +0,0 @@ -use strict; -use warnings; -use PostgresNode; -use TestLib; -use 
Test::More tests => 1; - -my $dim = 1024; - -# Initialize node -my $node = get_new_node('node'); -$node->init; -$node->start; - -# Create table -$node->safe_psql("postgres", "CREATE EXTENSION vector;"); -$node->safe_psql("postgres", "CREATE TABLE tst (v1 vector(1024), v2 vector(1024), v3 vector(1024));"); - -# Test insert succeeds -$node->safe_psql("postgres", - "INSERT INTO tst SELECT array_agg(n), array_agg(n), array_agg(n) FROM generate_series(1, $dim) n" -); - -# Change storage to PLAIN -$node->safe_psql("postgres", "ALTER TABLE tst ALTER COLUMN v1 SET STORAGE PLAIN"); -$node->safe_psql("postgres", "ALTER TABLE tst ALTER COLUMN v2 SET STORAGE PLAIN"); -$node->safe_psql("postgres", "ALTER TABLE tst ALTER COLUMN v3 SET STORAGE PLAIN"); - -# Test insert fails -my ($ret, $stdout, $stderr) = $node->psql("postgres", - "INSERT INTO tst SELECT array_agg(n), array_agg(n), array_agg(n) FROM generate_series(1, $dim) n" -); -like($stderr, qr/row is too big/); diff --git a/src/postgres/third-party-extensions/pgvector/test/yb_schedule b/src/postgres/third-party-extensions/pgvector/test/yb_schedule index 9eaeed501997..c8bf48223d65 100644 --- a/src/postgres/third-party-extensions/pgvector/test/yb_schedule +++ b/src/postgres/third-party-extensions/pgvector/test/yb_schedule @@ -6,8 +6,11 @@ # #################################################################################################### test: yb.orig.setup +test: bit test: cast -test: functions -test: input +test: yb.port.copy +test: halfvec +test: sparsevec +test: vector_type test: yb.orig.order_by test: yb.orig.index diff --git a/src/postgres/third-party-extensions/pgvector/vector.control b/src/postgres/third-party-extensions/pgvector/vector.control index 35412671c520..71f37019f4ec 100644 --- a/src/postgres/third-party-extensions/pgvector/vector.control +++ b/src/postgres/third-party-extensions/pgvector/vector.control @@ -1,4 +1,4 @@ -comment = 'vector data type and ivfflat access method' -default_version = '0.4.4-yb-1.2' 
+comment = 'vector data type and ybhnsw access method' +default_version = '0.8.0-yb-1.0' module_pathname = '$libdir/vector' relocatable = true