
Commit 83ecc01

feat: update merge_insert to add statistics for inserted, updated, deleted rows (lancedb#2357)
Addresses lancedb#2019
1 parent e310ab4 commit 83ecc01
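
In effect, MergeInsertBuilder.execute() now reports what the merge did. A minimal sketch of the new behavior, mirroring the updated doctest in python/python/lance/dataset.py (the dataset path and column names are illustrative):

import lance
import pyarrow as pa

# Create a small dataset, then upsert into it on key column "a".
table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
dataset = lance.write_dataset(table, "example")

new_table = pa.table({"a": [2, 3, 4], "b": ["x", "y", "z"]})
stats = (
    dataset.merge_insert("a")
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .execute(new_table)
)
# execute() previously returned nothing; with this change it returns
# merge statistics, e.g.:
# {'num_inserted_rows': 1, 'num_updated_rows': 2, 'num_deleted_rows': 0}
print(stats)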

File tree

12 files changed, +249 -104 lines changed


benchmarks/dbpedia-openai/benchmarks.py (+2 -1)

@@ -48,7 +48,8 @@ def ground_truth(
 
 def compute_recall(gt: np.ndarray, result: np.ndarray) -> float:
     recalls = [
-        np.isin(rst, gt_vector).sum() / rst.shape[0] for (rst, gt_vector) in zip(result, gt)
+        np.isin(rst, gt_vector).sum() / rst.shape[0]
+        for (rst, gt_vector) in zip(result, gt)
     ]
     return np.mean(recalls)
 

benchmarks/flat/benchmark.py (-1)

@@ -17,7 +17,6 @@
 import time
 
 import lance
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import pyarrow as pa

benchmarks/full_report/_lib.py (+13 -12)

@@ -6,9 +6,6 @@
 from typing import List
 
 import gzip
-import lance
-import numpy as np
-import pyarrow as pa
 import requests
 
 
@@ -33,15 +30,15 @@ def cosine(X, Y):
 def knn(
     query: np.ndarray,
     data: np.ndarray,
-    metric: Literal['L2', 'cosine'],
+    metric: Literal["L2", "cosine"],
     k: int,
 ) -> np.ndarray:
-    if metric == 'L2':
+    if metric == "L2":
         dist = l2
-    elif metric == 'cosine':
+    elif metric == "cosine":
         dist = cosine
     else:
-        raise ValueError('Invalid metric')
+        raise ValueError("Invalid metric")
     return np.argpartition(dist(query, data), k, axis=1)[:, 0:k]
 
 
@@ -51,10 +48,12 @@ def write_lance(
 ):
     dims = data.shape[1]
 
-    schema = pa.schema([
-        pa.field("vec", pa.list_(pa.float32(), dims)),
-        pa.field("id", pa.uint32(), False),
-    ])
+    schema = pa.schema(
+        [
+            pa.field("vec", pa.list_(pa.float32(), dims)),
+            pa.field("id", pa.uint32(), False),
+        ]
+    )
 
     fsl = pa.FixedSizeListArray.from_arrays(
         pa.array(data.reshape(-1).astype(np.float32), type=pa.float32()),
@@ -65,6 +64,7 @@ def write_lance(
 
     lance.write_dataset(t, path)
 
+
 # NYT
 
 _DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz"
@@ -112,7 +112,8 @@ def _get_nyt_vectors(
     tfidf = TfidfTransformer().fit_transform(freq)
     print("computing dense projection")
     dense_projection = random_projection.GaussianRandomProjection(
-        n_components=output_dims, random_state=42,
+        n_components=output_dims,
+        random_state=42,
     ).fit_transform(tfidf)
     dense_projection = dense_projection.astype(np.float32)
     np.save(_CACHE_PATH, dense_projection)
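
For context, the knn helper touched above picks the k nearest rows with np.argpartition, so the returned indices are the top-k but not sorted by distance. A hypothetical call, assuming _lib.py's l2/cosine helpers return an (n_queries, n_rows) distance matrix:

import numpy as np

# 5 query vectors against 100 database vectors, 16 dimensions each.
data = np.random.rand(100, 16).astype(np.float32)
query = np.random.rand(5, 16).astype(np.float32)

# Indices of the 10 nearest rows per query (unordered within the top 10).
neighbors = knn(query, data, metric="L2", k=10)
print(neighbors.shape)  # (5, 10)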

benchmarks/sift/index.py (-1)

@@ -20,7 +20,6 @@
 from subprocess import check_output
 
 import lance
-import pyarrow as pa
 
 
 def main():

benchmarks/tpch/benchmark.py (+8 -9)

@@ -1,7 +1,5 @@
 # Benchmark performance Lance vs Parquet w/ Tpch Q1 and Q6
 import lance
-import pandas as pd
-import pyarrow as pa
 import duckdb
 
 import sys
@@ -46,10 +44,10 @@
 num_args = len(sys.argv)
 assert num_args == 2
 
-query = ''
-if sys.argv[1] == 'q1':
+query = ""
+if sys.argv[1] == "q1":
     query = Q1
-elif sys.argv[1] == 'q6':
+elif sys.argv[1] == "q6":
     query = Q6
 else:
     sys.exit("We only support Q1 and Q6 for now")
@@ -62,17 +60,18 @@
 res1 = duckdb.sql(query).df()
 end1 = time.time()
 
-print("Lance Latency: ",str(round(end1 - start1, 3)) + 's')
+print("Lance Latency: ", str(round(end1 - start1, 3)) + "s")
 print(res1)
 
 ##### Parquet #####
 lineitem = None
 start2 = time.time()
 # read from parquet and create a view instead of table from it
-duckdb.sql("CREATE VIEW lineitem AS SELECT * FROM read_parquet('./dataset/lineitem_sf1.parquet');")
+duckdb.sql(
+    "CREATE VIEW lineitem AS SELECT * FROM read_parquet('./dataset/lineitem_sf1.parquet');"
+)
 res2 = duckdb.sql(query).df()
 end2 = time.time()
 
-print("Parquet Latency: ",str(round(end2 - start2, 3)) + 's')
+print("Parquet Latency: ", str(round(end2 - start2, 3)) + "s")
 print(res2)
-
docs/conf.py (-1)

@@ -1,7 +1,6 @@
 # Configuration file for the Sphinx documentation builder.
 
 import shutil
-import subprocess
 
 
 def run_apidoc(_):

docs/examples/gcs_example.py (+10 -6)

@@ -1,25 +1,29 @@
-#
+#
 # Lance example loading a dataset from Google Cloud Storage
 #
 # You need to set one of the following environment variables in order to authenticate with GS
 # - GOOGLE_SERVICE_ACCOUNT: location of service account file
 # - GOOGLE_SERVICE_ACCOUNT_KEY: JSON serialized service account key
 #
-# Follow this doc in order to create an service key: https://cloud.google.com/iam/docs/keys-create-delete
+# Follow this doc in order to create an service key: https://cloud.google.com/iam/docs/keys-create-delete
 #
 
 import lance
+import pandas as pd
 
 ds = lance.dataset("gs://eto-public/datasets/oxford_pet/oxford_pet.lance")
 count = ds.count_rows()
 print(f"There are {count} pets")
 
 # You can also write to GCS
-import pandas as pd
+
 uri = "gs://eto-public/datasets/oxford_pet/example.lance"
-lance.write_dataset(pd.DataFrame({"a": pd.array([10], dtype="Int32")}), uri, mode='create')
+lance.write_dataset(
+    pd.DataFrame({"a": pd.array([10], dtype="Int32")}), uri, mode="create"
+)
 assert lance.dataset(uri).version == 1
 
-lance.write_dataset(pd.DataFrame({"a": pd.array([5], dtype="Int32")}), uri, mode='append')
+lance.write_dataset(
+    pd.DataFrame({"a": pd.array([5], dtype="Int32")}), uri, mode="append"
+)
 assert lance.dataset(uri).version == 2
-
python/python/lance/dataset.py (+10 -6)

@@ -83,7 +83,9 @@ class MergeInsertBuilder(_MergeInsertBuilder):
     def execute(self, data_obj: ReaderLike, *, schema: Optional[pa.Schema] = None):
         """Executes the merge insert operation
 
-        There is no return value but the original dataset will be updated.
+        This function updates the original dataset and returns a dictionary with
+        information about merge statistics - i.e. the number of inserted, updated,
+        and deleted rows.
 
         Parameters
         ----------
@@ -97,7 +99,8 @@ def execute(self, data_obj: ReaderLike, *, schema: Optional[pa.Schema] = None):
             source is some kind of generator.
         """
         reader = _coerce_reader(data_obj, schema)
-        super(MergeInsertBuilder, self).execute(reader)
+
+        return super(MergeInsertBuilder, self).execute(reader)
 
     # These next three overrides exist only to document the methods
 
@@ -945,10 +948,11 @@ def merge_insert(
         >>> dataset = lance.write_dataset(table, "example")
         >>> new_table = pa.table({"a": [2, 3, 4], "b": ["x", "y", "z"]})
         >>> # Perform a "upsert" operation
-        >>> dataset.merge_insert("a") \\
-        ...     .when_matched_update_all() \\
-        ...     .when_not_matched_insert_all() \\
-        ...     .execute(new_table)
+        >>> dataset.merge_insert("a") \\
+        ...     .when_matched_update_all() \\
+        ...     .when_not_matched_insert_all() \\
+        ...     .execute(new_table)
+        {'num_inserted_rows': 1, 'num_updated_rows': 2, 'num_deleted_rows': 0}
         >>> dataset.to_table().sort_by("a").to_pandas()
            a  b
         0  1  b
