Merge remote-tracking branch 'upstream/main' into vfs
Ami11111 committed Aug 15, 2024
2 parents 6847932 + 86a880e commit 4351ead
Showing 31 changed files with 465 additions and 392 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -60,7 +60,7 @@ Supports a wide range of data types including strings, numerics, vectors, and mo
Infinity, also available as a Python module, eliminates the need for a separate back-end server and all the complex communication settings. Using `pip install` and `import infinity`, you can quickly build a local AI application in Python, leveraging the world's fastest and the most powerful RAG database:

```bash
pip install infinity-sdk==0.3.0.dev4
pip install infinity-sdk==0.3.0.dev5
```

```python
2 changes: 1 addition & 1 deletion benchmark/remote_infinity/remote_query_benchmark.cpp
@@ -51,7 +51,7 @@ struct InfinityClient {
transport->open();
CommonResponse response;
ConnectRequest request;
request.__set_client_version(13); // 0.3.0.dev4
request.__set_client_version(14); // 0.3.0.dev5
client->Connect(response, request);
session_id = response.session_id;
}
2 changes: 1 addition & 1 deletion client/cpp/infinity_client.cpp
@@ -25,7 +25,7 @@ Client Client::Connect(const std::string &ip_address, uint16_t port) {
transport->open();
CommonResponse response;
ConnectRequest request;
request.__set_client_version(13); // 0.3.0.dev4
request.__set_client_version(14); // 0.3.0.dev5
client->Connect(response, request);
return {socket, transport, protocol, std::move(client), response.session_id};
}
10 changes: 5 additions & 5 deletions docs/getstarted/deploy_infinity_server.mdx
@@ -30,7 +30,7 @@ This approach allows you to embed Infinity as a module in a Python application.
### Install Infinity as a module

```
pip install infinity-sdk==0.3.0.dev4
pip install infinity-sdk==0.3.0.dev5
```

### Create an Infinity object
@@ -98,7 +98,7 @@ If you are on Windows 10+, you must enable WSL or WSL2 to deploy Infinity using
### Install Infinity client

```
pip install infinity-sdk==0.3.0.dev4
pip install infinity-sdk==0.3.0.dev5
```

### Connect to Infinity Server
@@ -140,7 +140,7 @@ This section provides instructions on deploying Infinity using binary package on

Fedora/RHEL/CentOS/OpenSUSE
```bash
sudo rpm -i infinity-0.3.0.dev4-x86_64.rpm
sudo rpm -i infinity-0.3.0.dev5-x86_64.rpm
```

```bash
@@ -151,7 +151,7 @@ sudo systemctl start infinity
<TabItem value="ubuntu">

```bash
sudo dpkg -i infinity-0.3.0.dev4-x86_64.deb
sudo dpkg -i infinity-0.3.0.dev5-x86_64.deb
```

```bash
@@ -172,7 +172,7 @@ sudo systemctl start infinity
### Install Infinity client

```
pip install infinity-sdk==0.3.0.dev4
pip install infinity-sdk==0.3.0.dev5
```

### Connect to Infinity Server
2 changes: 1 addition & 1 deletion docs/getstarted/quickstart.md
@@ -17,7 +17,7 @@ Infinity, also available as a Python module, eliminates the need for a separate

1. Install Infinity as a module:
```bash
pip install infinity-sdk==0.3.0.dev4
pip install infinity-sdk==0.3.0.dev5
```
2. Use Infinity to conduct a KNN search:
```python
149 changes: 69 additions & 80 deletions docs/references/pysdk_api_reference.md
@@ -709,7 +709,7 @@ table_object.create_index(
InitParameter("M", "16"),
InitParameter("ef_construction", "50"),
InitParameter("ef", "50"),
InitParameter("metric", "l2")
InitParameter("metric", "l2"),
InitParameter("encode", "lvq") # "lvq" applies to float vector element only
]
),
@@ -1449,7 +1449,7 @@ table_object.output(["*"]).filter("c2 = 3").to_pl()
table_object.knn(vector_column_name, embedding_data, embedding_data_type, distance_type, topn, knn_params = None)
```

Performs a k-nearest neighbor (KNN) or approximate nearest neighbor (ANN) vector search to identify the top n closest rows to the given vector. Suitable for working with dense vectors (dense embeddings).
Performs a k-nearest neighbor (KNN) or approximate nearest neighbor (ANN) vector search to identify the top k closest rows to the given vector. Suitable for working with dense vectors (dense embeddings).
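
The brute-force computation behind such a search can be sketched in plain Python. This is an illustrative reference, not the SDK's implementation; `"l2"` and `"cosine"` here mirror two of the distance types used in this document:

```python
import math

def knn_search_ref(rows, query, topn, metric="l2"):
    """Brute-force KNN reference: return the indices of the topn rows
    closest to `query` under the given metric."""
    if metric == "l2":
        # Smaller Euclidean distance = closer.
        scored = [(math.dist(row, query), i) for i, row in enumerate(rows)]
    elif metric == "cosine":
        qn = math.sqrt(sum(q * q for q in query))
        def cos(row):
            rn = math.sqrt(sum(r * r for r in row))
            return sum(r * q for r, q in zip(row, query)) / (rn * qn)
        # Larger cosine similarity = closer; negate so sorting ascending works.
        scored = [(-cos(row), i) for i, row in enumerate(rows)]
    else:
        raise ValueError(f"unsupported metric: {metric}")
    scored.sort()
    return [i for _, i in scored[:topn]]

vectors = [[0.0, 0.0], [1.0, 1.0], [5.0, 5.0]]
print(knn_search_ref(vectors, [0.9, 1.1], 2))  # → [1, 0]
```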

### Parameters

@@ -1540,7 +1540,7 @@ If the HNSW index is not created successfully, the search will fall back to a br
## match_sparse

```python
table_object.match_sparse(vector_column_name, sparse_data, distance_type, topn, opt_params = None)
table_object.match_sparse(vector_column_name, sparse_data, distance_type, topn, opt_params)
```

Performs a sparse vector search to identify the top n closest rows to the given sparse vector. Suitable for working with sparse vectors (sparse embeddings).
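
What a sparse match with the `"ip"` (inner product) distance type computes can be sketched in plain Python. This is an illustrative reference, not the SDK's implementation; the `{"indices": [...], "values": [...]}` shape mirrors the sparse-vector examples elsewhere in this document:

```python
def match_sparse_ref(rows, query, topn):
    """Score each sparse row against a sparse query by inner product
    and return (row_index, score) pairs for the topn best matches."""
    q = dict(zip(query["indices"], query["values"]))
    scored = []
    for i, row in enumerate(rows):
        # Only dimensions present in both vectors contribute to the score.
        score = sum(v * q.get(idx, 0.0)
                    for idx, v in zip(row["indices"], row["values"]))
        scored.append((i, score))
    scored.sort(key=lambda p: -p[1])  # higher inner product = better match
    return scored[:topn]

rows = [{"indices": [0, 2], "values": [1.0, 2.0]},
        {"indices": [1], "values": [3.0]}]
query = {"indices": [0, 1], "values": [1.0, 1.0]}
print(match_sparse_ref(rows, query, topn=2))  # → [(1, 3.0), (0, 1.0)]
```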
@@ -1662,9 +1662,9 @@ A non-empty text string to search for. You can use various search options within

A non-empty string specifying the following search options:

- `"topn"`: `str`, *Required*
- **"topn"**: `str`, *Required*
Specifies the number of the most relevant rows to retrieve, e.g., `"topn=10"` to obtain the ten most relevant rows.
- `"operator"`: `str`, *Optional*
- **"operator"**: `str`, *Optional*
- If not specified, the search follows Infinity's full-text search syntax, meaning that logical and arithmetic operators and escape characters will function as full-text search operators, such as:
- `&&`, `+`, `||`, `!`, `NOT`, `AND`, `OR`, `-`, `(`, `)`, `~`, `^`, `:`, `"`.
- Escape characters like `\`, `\t`, and more.
@@ -1703,41 +1703,44 @@ for question in questions:

---

## match tensor
## fusion

```python
table_object.match_tensor(vector_column_name, tensor_data, tensor_data_type, method_type, topn, extra_option)
table_object.fusion(method, options_text, commonMatchTensorExpr = None)
```

Builds a KNN tensor search expression. Finds the top n closest rows to the given tensor according to the chosen method.

For example, find the k best-matching tensors generated by ColBERT.
Builds a fusion expression.

### Parameters

#### vector_column_name: `str`, *Required*

#### method: `str`

#### tensor_data: `list/np.ndarray`, *Required*
Supported reranking methods for multi-way retrieval include:

- `"rrf"`: [Reciprocal rank fusion](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf)
RRF scores all retrieved documents from each retrieval path using the reciprocals of their rankings. It then calculates the final score of each retrieved document by summing its scores from all retrieval paths. This technique is particularly useful when you are uncertain of the importance of each retrieval path.
- `"weighted_sum"`
The weighted sum approach assigns different weights to different retrieval paths, allowing you to emphasize specific paths. This is particularly useful when you are certain of each path's relative importance.
- `"match_tensor"`
  Infinity's tensor-based reranking approach. This is used for reranking dense vector, sparse vector, or full-text retrieval paths.
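
The RRF scoring described above can be sketched in plain Python (an illustrative reference, not Infinity's internals; `rank_constant` corresponds to the RRF option documented in this section):

```python
def rrf_fuse(rankings, rank_constant=60):
    """Reciprocal rank fusion: each retrieval path contributes
    1 / (rank_constant + rank) for every document it returned
    (ranks start at 1); the final score sums over all paths."""
    scores = {}
    for ranking in rankings:
        for rank, doc in enumerate(ranking, start=1):
            scores[doc] = scores.get(doc, 0.0) + 1.0 / (rank_constant + rank)
    # Highest fused score first.
    return sorted(scores.items(), key=lambda kv: -kv[1])

# Two retrieval paths; "a" is ranked first in both, so it wins.
print(rrf_fuse([["a", "b"], ["a", "c"]]))
```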

#### tensor_data_type: `str`, *Required*
#### options_text: `str`, *Required*

A non-empty, semicolon-separated string specifying the following reranking options:

#### method_type: `str`
- **Common options**: `str`, *Required*
Mandatory settings for the fused reranking.
- `"topn"`: Specifies the number of the most relevant rows to retrieve, e.g., `"topn=10"` to obtain the ten most relevant rows.

- `'maxsim'`
- **RRF-specific options**: `str`, *Optional*
Settings when employing RRF for reranking.
- `"rank_constant"`: The smoothing constant for RRF reranking. Typically set to `60`, e.g., `"topn=10;rank_constant=60"`.

#### extra_option: `str`
- **weighted_sum-specific options**: `str`, *Optional*
Settings when employing Weighted Sum for reranking.
  - `"weights"`: Specifies the weight for each retrieval path. For example, `"weights=1,2,0.5"` sets weights of `1`, `2`, and `0.5` for the first, second, and third retrieval paths, respectively. The default weight of each retrieval path is `1.0`. If `"weights"` is not specified, all retrieval paths will be assigned the default weight of `1.0`.
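
The weighted-sum behaviour described above can be sketched in plain Python (an illustrative reference, not Infinity's internals; unspecified weights default to `1.0`, mirroring the `"weights"` option):

```python
def weighted_sum_fuse(path_scores, weights=None):
    """Weighted-sum fusion: each path maps doc -> score; the fused
    score is sum(weight_i * score_i) over all paths."""
    if weights is None:
        weights = [1.0] * len(path_scores)  # default weight per path
    fused = {}
    for w, scores in zip(weights, path_scores):
        for doc, s in scores.items():
            fused[doc] = fused.get(doc, 0.0) + w * s
    # Highest fused score first.
    return sorted(fused.items(), key=lambda kv: -kv[1])

# Path 2 carries twice the weight, so "b" overtakes "a".
print(weighted_sum_fuse([{"a": 1.0, "b": 0.5}, {"b": 1.0}], [1.0, 2.0]))
```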

Options separated by ';'
- `'topn'`
- **EMVB index options**
- `'emvb_centroid_nprobe'`
- `'emvb_threshold_first'`
- `'emvb_n_doc_to_score'`
- `'emvb_n_doc_out_second_stage'`
- `'emvb_threshold_final'`
#### commonMatchTensorExpr: `CommonMatchTensorExpr`, *Optional*

### Returns

@@ -1748,69 +1751,55 @@ Options seperated by ';'

### Examples

```python
match_tensor('t', [[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]], 'float', 'maxsim', 'topn=2')
match_tensor('t', [[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]], 'float', 'maxsim', 'topn=10;emvb_centroid_nprobe=4;emvb_threshold_first=0.4;emvb_threshold_final=0.5')
```

---
The following code snippets illustrate the use of fused reranking in a four-way retrieval.

## fusion
#### Use RRF for reranking

```python
table_object.fusion(method, options_text = '')
```python {6}
table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"])
.knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3)
.match_sparse("sparse", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3)
.match("body", "blooms", "topn=10")
.filter("year < 2024")
.fusion("rrf", "topn=2")
.to_pl()
```

Builds a fusion expression.

### Parameters

#### method: `str`

The supported methods, including:

- `"rrf"`
- `"weighted_sum"`
- `"match_tensor"`

#### options_text: `str`

- `Common options`:
  - 'topn=10': Retrieve the 10 most relevant rows. The default value is `100`.

- `rrf-specific options`:
- 'rank_constant=30': The default value is `60`.

- `weighted_sum-specific options`:
- 'weights=1,2,0.5': The weights of children scorers. The default weight of each weight is `1.0`.

### Returns

- Success: Self `Table`
- Failure: `Exception`

### Examples

:::caution IMPORTANT
Ensure that you import the following when using `make_match_tensor_expr`:

```python
from infinity.remote_thrift.types import make_match_tensor_expr
```python {6}
table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"])
.knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3)
.match_sparse("sparse", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3)
.match("body", "blooms", "topn=10")
.filter("year < 2024")
.fusion("rrf", "topn=2;rank_constant=30")
.to_pl()
```
:::

```python
table_object.fusion('rrf')
table_object.fusion('rrf', 'topn=10')
table_object.fusion('weighted_sum', 'weights=1,2,0.5')
table_object.fusion('match_tensor', 'topn=2', make_match_tensor_expr('t', [[0.0, -10.0, 0.0, 0.7], [9.2, 45.6, -55.8, 3.5]], 'float', 'maxsim'))
```
#### Use Weighted Sum for reranking

### Details
```python {6}
table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"])
.knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3)
.match_sparse("sparse", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3)
.match("body", "blooms", "topn=10")
.filter("year < 2024")
.fusion("weighted_sum", "topn=2;weights=1,2,0.5")
.to_pl()
```

`rrf`: Reciprocal rank fusion method.
#### Use tensor reranking

[Reciprocal rank fusion (RRF)](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) is a method that combines multiple result sets with different relevance indicators into one result set. RRF does not require tuning, and the different relevance indicators do not have to be related to each other to achieve high-quality results.
```python {8}
# You must import `CommonMatchTensorExpr` to set tensor reranking parameters
from infinity.common import CommonMatchTensorExpr
table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"])
.knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3)
.match_sparse("sparse", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3)
.match("body", "blooms", "topn=10")
.filter("year < 2024")
            .fusion("match_tensor", "topn=2", CommonMatchTensorExpr("tensor", [[0.0, -10.0, 0.0, 0.7], [9.2, 45.6, -55.8, 3.5]], "float", "maxsim"))
.to_pl()
```

---

@@ -1891,7 +1880,7 @@ table_object.to_arrow()

Returns the query result in Apache Arrow Table format.

:::note
:::tip NOTE
Call `to_arrow()` in a chain after (not necessarily "immediately after") `output(columns)` on the same table object.
:::

3 changes: 2 additions & 1 deletion python/infinity/remote_thrift/client.py
@@ -60,7 +60,8 @@ def reconnect(self):
# version: 0.3.0.dev1, client_version: 11
# version: 0.3.0.dev3, client_version: 12
# version: 0.3.0.dev4, client_version: 13
res = self.client.Connect(ConnectRequest(client_version=13))
# version: 0.3.0.dev5, client_version: 14
res = self.client.Connect(ConnectRequest(client_version=14))
if res.error_code != 0:
raise InfinityException(res.error_code, res.error_msg)
self.session_id = res.session_id
13 changes: 8 additions & 5 deletions src/embedded_infinity/wrap_infinity.cpp
@@ -890,7 +890,7 @@ void HandleVarcharType(ColumnField &output_column_field, SizeT row_count, const
std::memcpy(dst.data() + current_offset, &length, sizeof(i32));
std::memcpy(dst.data() + current_offset + sizeof(i32), varchar.short_.data_, varchar.length_);
} else {
const char *data = column_vector->buffer_->var_buffer_mgr_->Get(varchar.vector_.file_offset_, varchar.length_);
const char *data = column_vector->buffer_->GetVarchar(varchar.vector_.file_offset_, varchar.length_);
std::memcpy(dst.data() + current_offset, &length, sizeof(i32));
std::memcpy(dst.data() + current_offset + sizeof(i32), data, varchar.length_);
}
@@ -991,12 +991,15 @@ void HandleSparseType(ColumnField &output_column_field, SizeT row_count, const S
for (SizeT index = 0; index < row_count; ++index) {
SparseT &sparse = reinterpret_cast<SparseT *>(column_vector->data())[index];
i32 nnz = sparse.nnz_;
i32 length = sparse_info->SparseSize(nnz);
std::memcpy(dst.data() + current_offset, &nnz, sizeof(i32));
current_offset += sizeof(i32);
const auto raw_data_ptr = column_vector->buffer_->fix_heap_mgr_->GetRawPtrFromChunk(sparse.chunk_id_, sparse.chunk_offset_);
std::memcpy(dst.data() + current_offset, raw_data_ptr, length);
current_offset += length;
SizeT data_size = sparse_info->DataSize(sparse.nnz_);
SizeT idx_size = sparse_info->IndiceSize(sparse.nnz_);
auto [raw_data_ptr, raw_idx_ptr] = column_vector->buffer_->GetSparseRaw(sparse.file_offset_, sparse.nnz_, sparse_info);
std::memcpy(dst.data() + current_offset, raw_idx_ptr, idx_size);
current_offset += idx_size;
std::memcpy(dst.data() + current_offset, raw_data_ptr, data_size);
current_offset += data_size;
}

output_column_field.column_vectors.emplace_back(dst.c_str(), dst.size());
9 changes: 6 additions & 3 deletions src/executor/operator/physical_import.cpp
@@ -904,7 +904,10 @@ SharedPtr<ConstantExpr> BuildConstantExprFromJson(const nlohmann::json &json_obj
SharedPtr<ConstantExpr> BuildConstantSparseExprFromJson(const nlohmann::json &json_object, const SparseInfo *sparse_info) {
SharedPtr<ConstantExpr> res = nullptr;
switch (sparse_info->DataType()) {
case EmbeddingDataType::kElemBit:
case EmbeddingDataType::kElemBit: {
res = MakeShared<ConstantExpr>(LiteralType::kIntegerArray);
break;
}
case EmbeddingDataType::kElemUInt8:
case EmbeddingDataType::kElemInt8:
case EmbeddingDataType::kElemInt16:
@@ -935,9 +938,9 @@ SharedPtr<ConstantExpr> BuildConstantSparseExprFromJson(const nlohmann::json &js
switch (json_object[0].type()) {
case nlohmann::json::value_t::number_unsigned:
case nlohmann::json::value_t::number_integer: {
res->long_sparse_array_.first.resize(array_size);
res->long_array_.resize(array_size);
for (u32 i = 0; i < array_size; ++i) {
res->long_sparse_array_.first[i] = json_object[i].get<i64>();
res->long_array_[i] = json_object[i].get<i64>();
}
return res;
}
@@ -370,14 +370,8 @@ void PhysicalMatchSparseScan::ExecuteInnerT(DistFunc *dist_func,

auto get_ele = [](const ColumnVector &column_vector, SizeT idx) -> SparseVecRef<typename DistFunc::DataT, typename DistFunc::IndexT> {
const auto *ele = reinterpret_cast<const SparseT *>(column_vector.data()) + idx;
const auto &[nnz, chunk_id, chunk_offset] = *ele;
if (nnz == 0) {
return SparseVecRef<typename DistFunc::DataT, typename DistFunc::IndexT>(0, nullptr, nullptr);
}
const char *sparse_ptr = column_vector.buffer_->fix_heap_mgr_->GetRawPtrFromChunk(chunk_id, chunk_offset);
const auto *indices = reinterpret_cast<const typename DistFunc::IndexT *>(sparse_ptr);
const auto *data = reinterpret_cast<const typename DistFunc::DataT *>(sparse_ptr + nnz * sizeof(typename DistFunc::IndexT));
return SparseVecRef<typename DistFunc::DataT, typename DistFunc::IndexT>(nnz, indices, data);
const auto &[nnz, file_offset] = *ele;
return column_vector.buffer_->template GetSparse<typename DistFunc::DataT, typename DistFunc::IndexT>(file_offset, nnz);
};

auto task_id = block_ids_idx;