From e02e019cca40db7f25bb74458bb3fc8d921edff3 Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Tue, 23 Jan 2024 19:03:14 -0500 Subject: [PATCH 01/18] Implemented AnnoyIndex serialization --- src/annoylib.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/annoylib.h b/src/annoylib.h index 657977cb..9a35fb44 100644 --- a/src/annoylib.h +++ b/src/annoylib.h @@ -912,6 +912,7 @@ class AnnoyIndexInterface { virtual bool save(const char* filename, bool prefault=false, char** error=NULL) = 0; virtual void unload() = 0; virtual bool load(const char* filename, bool prefault=false, char** error=NULL) = 0; + virtual vector serialize(char** error=NULL) = 0; virtual T get_distance(S i, S j) const = 0; virtual void get_nns_by_item(S item, size_t n, int search_k, vector* result, vector* distances) const = 0; virtual void get_nns_by_vector(const T* w, size_t n, int search_k, vector* result, vector* distances) const = 0; @@ -1221,6 +1222,15 @@ template serialize(char** error=NULL) { + if (!_built) { + set_error_from_string(error, "Index cannot be serialized if it hasn't been built"); + return {}; + } + + return vector(_nodes, _nodes + _n_nodes * _s); + } + T get_distance(S i, S j) const { return D::normalized_distance(D::distance(_get(i), _get(j), _f)); } From 813144dbdd53896d55d63ff1843893fede0762b7 Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Wed, 24 Jan 2024 08:07:15 -0500 Subject: [PATCH 02/18] Implemented AnnoyIndex deserialization --- src/annoylib.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/src/annoylib.h b/src/annoylib.h index 9a35fb44..b2fbf4ca 100644 --- a/src/annoylib.h +++ b/src/annoylib.h @@ -913,6 +913,7 @@ class AnnoyIndexInterface { virtual void unload() = 0; virtual bool load(const char* filename, bool prefault=false, char** error=NULL) = 0; virtual vector serialize(char** error=NULL) = 0; + virtual bool deserialize(const vector bytes, bool prefault=false, char** error=NULL) = 0; virtual T get_distance(S i, S j) const = 0; virtual void get_nns_by_item(S item, size_t n, int search_k, vector* result, vector* distances) const = 0; virtual void get_nns_by_vector(const T* w, size_t n, int search_k, vector* result, vector* distances) const = 0; @@ -1231,6 +1232,53 @@ template(_nodes, _nodes + _n_nodes * _s); } + bool deserialize(const vector bytes, bool prefault=false, char** error=NULL) { + if (bytes.size() == 0) { + set_error_from_errno(error, "Size of bytes is zero"); + return false; + } + + if (bytes.size() % _s) { + // Something is fishy with this index! + set_error_from_errno(error, "Index size is not a multiple of vector size. Ensure you are opening using the same metric you used to create the index."); + return false; + } + + int flags = MAP_SHARED; + if (prefault) { +#ifdef MAP_POPULATE + flags |= MAP_POPULATE; +#else + annoylib_showUpdate("prefault is set to true, but MAP_POPULATE is not defined on this platform"); +#endif + } + + _nodes = (Node*)bytes.data(); + _n_nodes = (S)(bytes.size() / _s); + + _roots.clear(); + S m = -1; + + for (S i = _n_nodes - 1; i >= 0; i--) { + S k = _get(i)->n_descendants; + if (m == -1 || k == m) { + _roots.push_back(i); + m = k; + } else { + break; + } + } + + // hacky fix: since the last root precedes the copy of all roots, delete it + if (_roots.size() > 1 && _get(_roots.front())->children[0] == _get(_roots.back())->children[0]) + _roots.pop_back(); + _loaded = true; + _built = true; + _n_items = m; + if (_verbose) annoylib_showUpdate("found %zu roots with degree %d\n", _roots.size(), m); + return true; + } + T get_distance(S i, S j) const { return D::normalized_distance(D::distance(_get(i), _get(j), _f)); } From 83641f2c7821f14b560c2f10c6c77ef4083bd5a2 Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Wed, 24 Jan 2024 08:12:56 -0500 Subject: [PATCH 03/18] Added CPython function headers for serialize and deserialize --- annoy/__init__.pyi | 2 ++ src/annoylib.h | 12 ++++++------ src/annoymodule.cc | 3 ++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/annoy/__init__.pyi b/annoy/__init__.pyi index 08adf4b9..c30d21aa 100644 --- a/annoy/__init__.pyi +++ b/annoy/__init__.pyi @@ -10,6 +10,8 @@ class AnnoyIndex: def __init__(self, f: int, metric: Literal["angular", "euclidean", "manhattan", "hamming", "dot"]) -> None: ... def load(self, fn: str, prefault: bool = ...) -> Literal[True]: ... def save(self, fn: str, prefault: bool = ...) -> Literal[True]: ... + def serialize(self) -> bytes: ... + def deserialize(self, data: bytes, prefault: bool = ...) -> Literal[True]: ... @overload def get_nns_by_item(self, i: int, n: int, search_k: int = ..., include_distances: Literal[False] = ...) -> list[int]: ... @overload diff --git a/src/annoylib.h b/src/annoylib.h index b2fbf4ca..653d4e90 100644 --- a/src/annoylib.h +++ b/src/annoylib.h @@ -913,7 +913,7 @@ class AnnoyIndexInterface { virtual void unload() = 0; virtual bool load(const char* filename, bool prefault=false, char** error=NULL) = 0; virtual vector serialize(char** error=NULL) = 0; - virtual bool deserialize(const vector bytes, bool prefault=false, char** error=NULL) = 0; + virtual bool deserialize(vector* bytes, bool prefault=false, char** error=NULL) = 0; virtual T get_distance(S i, S j) const = 0; virtual void get_nns_by_item(S item, size_t n, int search_k, vector* result, vector* distances) const = 0; virtual void get_nns_by_vector(const T* w, size_t n, int search_k, vector* result, vector* distances) const = 0; @@ -1232,13 +1232,13 @@ template(_nodes, _nodes + _n_nodes * _s); } - bool deserialize(const vector bytes, bool prefault=false, char** error=NULL) { - if (bytes.size() == 0) { + bool deserialize(vector* bytes, bool prefault=false, char** error=NULL) { + if (bytes->empty()) { set_error_from_errno(error, "Size of bytes is zero"); return false; } - if (bytes.size() % _s) { + if (bytes->size() % _s) { // Something is fishy with this index! set_error_from_errno(error, "Index size is not a multiple of vector size. Ensure you are opening using the same metric you used to create the index."); return false; @@ -1253,8 +1253,8 @@ templatedata(); + _n_nodes = (S)(bytes->size() / _s); _roots.clear(); S m = -1; diff --git a/src/annoymodule.cc b/src/annoymodule.cc index 6bb0ae1b..a978f237 100644 --- a/src/annoymodule.cc +++ b/src/annoymodule.cc @@ -96,6 +96,8 @@ class HammingWrapper : public AnnoyIndexInterface { bool save(const char* filename, bool prefault, char** error) { return _index.save(filename, prefault, error); }; void unload() { _index.unload(); }; bool load(const char* filename, bool prefault, char** error) { return _index.load(filename, prefault, error); }; + vector serialize(char** error) { return _index.serialize(error); }; + bool deserialize(vector* bytes, bool prefault, char** error) { return _index.deserialize(bytes, prefault, error); }; float get_distance(int32_t i, int32_t j) const { return _index.get_distance(i, j); }; void get_nns_by_item(int32_t item, size_t n, int search_k, vector* result, vector* distances) const { if (distances) { @@ -235,7 +237,6 @@ py_an_save(py_annoy *self, PyObject *args, PyObject *kwargs) { Py_RETURN_TRUE; } - PyObject* get_nns_to_python(const vector& result, const vector& distances, int include_distances) { PyObject* l = NULL; From c9d2d18eb6715035376001bdbcf93cac54d68060 Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Wed, 24 Jan 2024 13:27:03 -0500 Subject: [PATCH 04/18] Implemented AnnoyIndex python c extensions for serialization and deserialization --- src/annoymodule.cc | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/annoymodule.cc b/src/annoymodule.cc index a978f237..79f388f9 100644 --- a/src/annoymodule.cc +++ b/src/annoymodule.cc @@ -16,6 +16,7 @@ #include "kissrandom.h" #include "Python.h" #include "structmember.h" +#include "bytesobject.h" #include #if defined(_MSC_VER) && _MSC_VER == 1500 typedef signed __int32 int32_t; @@ -237,6 +238,45 @@ py_an_save(py_annoy *self, PyObject *args, PyObject *kwargs) { Py_RETURN_TRUE; } +static PyObject * +py_an_serialize(py_annoy *self, PyObject *args, PyObject *kwargs) { + bool prefault = false; + if (!self->ptr) + return NULL; + + vector bytes = self->ptr->serialize(NULL); + + return PyBytes_FromStringAndSize((const char*)bytes.data(), bytes.size()); +} + +static PyObject * +py_an_deserialize(py_annoy *self, PyObject *args, PyObject *kwargs) { + PyObject* bytes; + bool prefault = false; + if (!self->ptr) + return NULL; + + static char const * kwlist[] = {"bytes", "prefault", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|b", (char**)kwlist, &bytes, &prefault)) + return NULL; + + if (!PyBytes_Check(bytes)) { + PyErr_SetString(PyExc_TypeError, "Expected bytes"); + return NULL; + } + + vector v(PyBytes_Size(bytes)); + memcpy(v.data(), PyBytes_AsString(bytes), v.size()); + + char* error; + if (!self->ptr->deserialize(&v, prefault, &error)) { + PyErr_SetString(PyExc_IOError, error); + free(error); + return NULL; + } + Py_RETURN_TRUE; +} + PyObject* get_nns_to_python(const vector& result, const vector& distances, int include_distances) { PyObject* l = NULL; @@ -576,6 +616,8 @@ py_an_set_seed(py_annoy *self, PyObject *args) { static PyMethodDef AnnoyMethods[] = { {"load", (PyCFunction)py_an_load, METH_VARARGS | METH_KEYWORDS, "Loads (mmaps) an index from disk."}, {"save", (PyCFunction)py_an_save, METH_VARARGS | METH_KEYWORDS, "Saves the index to disk."}, + {"serialize", (PyCFunction)py_an_serialize, METH_VARARGS | METH_KEYWORDS, "Serializes the index to bytes."}, + {"deserialize", (PyCFunction)py_an_deserialize, METH_VARARGS | METH_KEYWORDS, "Deserializes the index from bytes."}, {"get_nns_by_item",(PyCFunction)py_an_get_nns_by_item, METH_VARARGS | METH_KEYWORDS, "Returns the `n` closest items to item `i`.\n\n:param search_k: the query will inspect up to `search_k` nodes.\n`search_k` gives you a run-time tradeoff between better accuracy and speed.\n`search_k` defaults to `n_trees * n` if not provided.\n\n:param include_distances: If `True`, this function will return a\n2 element tuple of lists. The first list contains the `n` closest items.\nThe second list contains the corresponding distances."}, {"get_nns_by_vector",(PyCFunction)py_an_get_nns_by_vector, METH_VARARGS | METH_KEYWORDS, "Returns the `n` closest items to vector `vector`.\n\n:param search_k: the query will inspect up to `search_k` nodes.\n`search_k` gives you a run-time tradeoff between better accuracy and speed.\n`search_k` defaults to `n_trees * n` if not provided.\n\n:param include_distances: If `True`, this function will return a\n2 element tuple of lists. The first list contains the `n` closest items.\nThe second list contains the corresponding distances."}, {"get_item_vector",(PyCFunction)py_an_get_item_vector, METH_VARARGS, "Returns the vector for item `i` that was previously added."}, From f0a2dc237100722eb3fd14f7c1a737c37ba92f8d Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Wed, 24 Jan 2024 14:49:35 -0500 Subject: [PATCH 05/18] Fixed vector construction compile error --- src/annoylib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/annoylib.h b/src/annoylib.h index 653d4e90..08421d03 100644 --- a/src/annoylib.h +++ b/src/annoylib.h @@ -1229,7 +1229,7 @@ template(_nodes, _nodes + _n_nodes * _s); + return vector((uint8_t*)_nodes, (uint8_t*)_nodes + _n_nodes * _s); } bool deserialize(vector* bytes, bool prefault=false, char** error=NULL) { From 7121f5c7bede08464d4d41651d75d26d51d36b55 Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Wed, 24 Jan 2024 14:50:18 -0500 Subject: [PATCH 06/18] Added '.eggs/' directory to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3e54ba1a..a1c3e257 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.egg-info/ *.egg/ +*.eggs/ *.so *.o build/ From f22e3f7b4910ef0b9bbce8b0fdc6d66e61bc1aaf Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Wed, 24 Jan 2024 17:44:25 -0500 Subject: [PATCH 07/18] Fix deserialization --- src/annoylib.h | 6 +++++- src/annoymodule.cc | 21 ++++++++++++++------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/annoylib.h b/src/annoylib.h index 08421d03..ff6cf653 100644 --- a/src/annoylib.h +++ b/src/annoylib.h @@ -1253,7 +1253,10 @@ templatedata(); + _allocate_size((S)(bytes->size() / _s)); + + memcpy(_nodes, bytes->data(), bytes->size()); + _n_nodes = (S)(bytes->size() / _s); _roots.clear(); @@ -1272,6 +1275,7 @@ template 1 && _get(_roots.front())->children[0] == _get(_roots.back())->children[0]) _roots.pop_back(); + _loaded = true; _built = true; _n_items = m; diff --git a/src/annoymodule.cc b/src/annoymodule.cc index 79f388f9..b9850198 100644 --- a/src/annoymodule.cc +++ b/src/annoymodule.cc @@ -240,7 +240,6 @@ py_an_save(py_annoy *self, PyObject *args, PyObject *kwargs) { static PyObject * py_an_serialize(py_annoy *self, PyObject *args, PyObject *kwargs) { - bool prefault = false; if (!self->ptr) return NULL; @@ -251,29 +250,37 @@ py_an_serialize(py_annoy *self, PyObject *args, PyObject *kwargs) { static PyObject * py_an_deserialize(py_annoy *self, PyObject *args, PyObject *kwargs) { - PyObject* bytes; + PyObject* bytes_object; + char *error; bool prefault = false; if (!self->ptr) return NULL; static char const * kwlist[] = {"bytes", "prefault", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|b", (char**)kwlist, &bytes, &prefault)) + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "S|b", (char**)kwlist, &bytes_object, &prefault)) + return NULL; + + if (bytes_object == NULL) { + PyErr_SetString(PyExc_TypeError, "Expected bytes"); return NULL; + } - if (!PyBytes_Check(bytes)) { + if (!PyBytes_Check(bytes_object)) { PyErr_SetString(PyExc_TypeError, "Expected bytes"); return NULL; } - vector v(PyBytes_Size(bytes)); - memcpy(v.data(), PyBytes_AsString(bytes), v.size()); + Py_ssize_t length = PyBytes_Size(bytes_object); + uint8_t* raw_bytes = (uint8_t*)PyBytes_AsString(bytes_object); + vector v(raw_bytes, raw_bytes + length); - char* error; if (!self->ptr->deserialize(&v, prefault, &error)) { PyErr_SetString(PyExc_IOError, error); free(error); return NULL; } + Py_RETURN_TRUE; } From 4661f319e5d1424b8d649a50a626c41283a0fb63 Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Wed, 24 Jan 2024 17:50:30 -0500 Subject: [PATCH 08/18] Added serialization test --- test/serialize_test.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 test/serialize_test.py diff --git a/test/serialize_test.py b/test/serialize_test.py new file mode 100644 index 00000000..ad799d95 --- /dev/null +++ b/test/serialize_test.py @@ -0,0 +1,41 @@ +import random + +from annoy import AnnoyIndex + +def test_serialize_index(): + f = 32 + + index = AnnoyIndex(f, 'angular') + + for iteration in range(1000): + vector = [random.gauss(0, 1) for z in range(f)] + index.add_item(iteration, vector) + + index.build(10) + + _ = index.serialize() + + +def test_deserialize_index(): + f = 32 + + index = AnnoyIndex(f, 'angular') + + for iteration in range(1000): + vector = [random.gauss(0, 1) for z in range(f)] + index.add_item(iteration, vector) + + index.build(10) + + data = index.serialize() + + index2 = AnnoyIndex(f, 'angular') + + index2.deserialize(data) + + index_item_count = index.get_n_items() + + assert index_item_count == index2.get_n_items() + assert index.get_n_trees() == index2.get_n_trees() + assert index.get_nns_by_item(0, index_item_count) == index2.get_nns_by_item(0, index_item_count) + From 915d90b44599059157e4b3ae7836d89655e158a2 Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Wed, 24 Jan 2024 18:14:36 -0500 Subject: [PATCH 09/18] Added go module code --- src/annoygomodule.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/annoygomodule.h b/src/annoygomodule.h index 074cc635..a4cad929 100644 --- a/src/annoygomodule.h +++ b/src/annoygomodule.h @@ -36,6 +36,15 @@ class AnnoyIndex { bool load(const char* filename) { return ptr->load(filename, true); }; + vector serialize() { + return ptr->serialize(); + } + bool deserialize(vector* v, bool prefault) { + return ptr->deserialize(v, prefault); + } + bool deserialize(vector* v) { + return ptr->deserialize(v, true); + } float getDistance(int i, int j) { return ptr->get_distance(i, j); }; From f8c6c059c25fb084997940bc25d94db304f83448 Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Thu, 25 Jan 2024 08:39:10 -0500 Subject: [PATCH 10/18] Added test for go bindings serialization --- test/annoy_test.go | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/test/annoy_test.go b/test/annoy_test.go index bd0e569d..44505c6c 100644 --- a/test/annoy_test.go +++ b/test/annoy_test.go @@ -104,6 +104,51 @@ func (suite *AnnoyTestSuite) TestFileHandling() { os.Remove("go_test3.ann") } +func (suite *AnnoyTestSuite) TestSerialization() { + index := annoyindex.NewAnnoyIndexAngular(3) + index.AddItem(0, []float32{0, 0, 1}) + index.AddItem(1, []float32{0, 1, 0}) + index.AddItem(2, []float32{1, 0, 0}) + index.Build(10) + + bytes := index.Serialize() + + index2 := annoyindex.NewAnnoyIndexAngular(3) + + success := index2.Deserialize(bytes) + + if !success { + assert.Fail(suite.T(), "Failed to deserialize") + } + + itemCountIsSame := index.GetNItems() == index2.GetNItems() + + if !itemCountIsSame { + assert.Fail(suite.T(), "Item count is not the same") + } + + var resultIndex []int + var resultIndex2 []int + + itemCount := index.GetNItems() + + index.GetNnsByItem(0, itemCount, -1, &resultIndex) + index2.GetNnsByItem(0, itemCount, -1, &resultIndex2) + + itemsAreSame := true + + for index := 0; index < itemCount; index++ { + if resultIndex[index] != resultIndex2[index] { + itemsAreSame = false + break + } + } + + if !itemsAreSame { + assert.Fail(suite.T(), "Items are not the same") + } +} + func (suite *AnnoyTestSuite) TestOnDiskBuild() { index := annoyindex.NewAnnoyIndexAngular(3) index.OnDiskBuild("go_test.ann"); From b8a939f9c34fbc1279ac3fd41ecbeb21ecf84bf3 Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Thu, 25 Jan 2024 09:10:39 -0500 Subject: [PATCH 11/18] Implemented lua bindings for serialize and deserialize --- src/annoyluamodule.cc | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/annoyluamodule.cc b/src/annoyluamodule.cc index a005df11..041f715c 100644 --- a/src/annoyluamodule.cc +++ b/src/annoyluamodule.cc @@ -164,6 +164,27 @@ class LuaAnnoy { return 1; } + static int serialize(lua_State* L) { + Impl* self = getAnnoy(L, 1); + int nargs = lua_gettop(L); + vector bytes = self->serialize(); + + lua_pushlstring(L, (const char*) bytes.data(), bytes.size()); + + return 1; + } + + static int deserialize(lua_State* L) { + Impl* self = getAnnoy(L, 1); + int nargs = lua_gettop(L); + const char* bytes_buffer = luaL_checkstring(L, 2); + size_t bytes_buffer_size = strlen(bytes_buffer); + vector bytes(bytes_buffer, bytes_buffer + bytes_buffer_size); + self->deserialize(bytes); + + return 1; + } + static int unload(lua_State* L) { Impl* self = getAnnoy(L, 1); self->unload(); From d81b569968a0e20e9bcbededb8d62f46beab71cc Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Thu, 25 Jan 2024 17:39:37 -0500 Subject: [PATCH 12/18] Fixed deserialize functionality and added functions to table --- src/annoyluamodule.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/annoyluamodule.cc b/src/annoyluamodule.cc index 041f715c..c638f4d3 100644 --- a/src/annoyluamodule.cc +++ b/src/annoyluamodule.cc @@ -177,10 +177,10 @@ class LuaAnnoy { static int deserialize(lua_State* L) { Impl* self = getAnnoy(L, 1); int nargs = lua_gettop(L); - const char* bytes_buffer = luaL_checkstring(L, 2); - size_t bytes_buffer_size = strlen(bytes_buffer); + const char* bytes_buffer = lua_tostring(L, 2); + size_t bytes_buffer_size = lua_rawlen(L, 2); vector bytes(bytes_buffer, bytes_buffer + bytes_buffer_size); - self->deserialize(bytes); + self->deserialize(&bytes); return 1; } @@ -281,6 +281,8 @@ class LuaAnnoy { {"build", &ThisClass::build}, {"save", &ThisClass::save}, {"load", &ThisClass::load}, + {"serialize", &ThisClass::serialize}, + {"deserialize", &ThisClass::deserialize}, {"unload", &ThisClass::unload}, {"get_nns_by_item", &ThisClass::get_nns_by_item}, {"get_nns_by_vector", &ThisClass::get_nns_by_vector}, From f8e99d75f8d9aef3696abfde79067a2c619ea49d Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Thu, 25 Jan 2024 17:40:51 -0500 Subject: [PATCH 13/18] Added lua test 'serialize_deserialize' --- test/annoy_test.lua | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test/annoy_test.lua b/test/annoy_test.lua index 5e8d2e02..ef922bbd 100644 --- a/test/annoy_test.lua +++ b/test/annoy_test.lua @@ -496,6 +496,30 @@ describe("index test", function() assert.same(u, y) end) + it("serialize_deserialize", function() + local f = 2 + local i = AnnoyIndex(f, 'euclidean') + i:add_item(0, {2, 2}) + i:add_item(1, {3, 2}) + i:add_item(2, {3, 3}) + i:add_item(3, {4, 4}) + i:add_item(4, {5, 5}) + i:build(10) + + local bytes = i:serialize() + + local j = AnnoyIndex(f, 'euclidean') + + j:deserialize(bytes) + + local item_count = 4 + + local first_items = i:get_nns_by_item(0, item_count) + local second_items = j:get_nns_by_item(0, item_count) + + assert.same(first_items, second_items) + end) + it("on_disk_build", function() local f = 2 local i = AnnoyIndex(f, 'euclidean') From f7df982b1031cb0465042db6959ad773aee0a165 Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Sun, 28 Jan 2024 11:03:33 -0500 Subject: [PATCH 14/18] Added test for serialization on index mmaped from file --- test/serialize_test.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/test/serialize_test.py b/test/serialize_test.py index ad799d95..e1e8d1a6 100644 --- a/test/serialize_test.py +++ b/test/serialize_test.py @@ -8,7 +8,7 @@ def test_serialize_index(): index = AnnoyIndex(f, 'angular') for iteration in range(1000): - vector = [random.gauss(0, 1) for z in range(f)] + vector = [random.gauss(0, 1) for _ in range(f)] index.add_item(iteration, vector) index.build(10) @@ -22,7 +22,7 @@ def test_deserialize_index(): index = AnnoyIndex(f, 'angular') for iteration in range(1000): - vector = [random.gauss(0, 1) for z in range(f)] + vector = [random.gauss(0, 1) for _ in range(f)] index.add_item(iteration, vector) index.build(10) @@ -39,3 +39,22 @@ def test_deserialize_index(): assert index.get_n_trees() == index2.get_n_trees() assert index.get_nns_by_item(0, index_item_count) == index2.get_nns_by_item(0, index_item_count) +def test_serialize_after_load(): + f = 32 + + index1 = AnnoyIndex(f, 'angular') + + for iteration in range(1000): + vector = [random.gauss(0, 1) for _ in range(f)] + index1.add_item(iteration, vector) + + index1.build(10) + + save_path = "test/test.tree" + index1.save(save_path) + + index2 = AnnoyIndex(f, 'angular') + index2.load(save_path) + + assert index1.serialize() == index2.serialize() + From 9ead484a0d2e439471a36270bfc68ac835945d23 Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Sun, 28 Jan 2024 17:11:55 -0500 Subject: [PATCH 15/18] Implemented root caching to replace computation of roots during deserialization --- src/annoylib.h | 62 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/src/annoylib.h b/src/annoylib.h index ff6cf653..1c63d182 100644 --- a/src/annoylib.h +++ b/src/annoylib.h @@ -1229,7 +1229,25 @@ template((uint8_t*)_nodes, (uint8_t*)_nodes + _n_nodes * _s); + vector bytes {}; + + S n_items = _n_items; + S n_nodes = _n_nodes; + size_t roots_size = _roots.size(); + S nodes_size = _nodes_size; + + bytes.insert(bytes.end(), (uint8_t*)&n_items, (uint8_t*)&n_items + sizeof(n_items)); + bytes.insert(bytes.end(), (uint8_t*)&n_nodes, (uint8_t*)&n_nodes + sizeof(n_nodes)); + bytes.insert(bytes.end(), (uint8_t*)&roots_size, (uint8_t*)&roots_size + sizeof(roots_size)); + bytes.insert(bytes.end(), (uint8_t*)&nodes_size, (uint8_t*)&nodes_size + sizeof(nodes_size)); + + uint8_t* roots_buffer = (uint8_t*)_roots.data(); + bytes.insert(bytes.end(), roots_buffer, roots_buffer + _roots.size() * sizeof(S)); + + uint8_t* nodes_buffer = (uint8_t*)_nodes; + bytes.insert(bytes.end(), nodes_buffer, nodes_buffer + _n_nodes * _s); + + return bytes; } bool deserialize(vector* bytes, bool prefault=false, char** error=NULL) { @@ -1238,12 +1256,6 @@ templatesize() % _s) { - // Something is fishy with this index! - set_error_from_errno(error, "Index size is not a multiple of vector size. Ensure you are opening using the same metric you used to create the index."); - return false; - } - int flags = MAP_SHARED; if (prefault) { #ifdef MAP_POPULATE @@ -1253,33 +1265,33 @@ templatesize() / _s)); + uint8_t* bytes_buffer = (uint8_t*)bytes->data(); + + _n_items = *(S*)bytes_buffer; + bytes_buffer += sizeof(S); - memcpy(_nodes, bytes->data(), bytes->size()); + _n_nodes = *(S*)bytes_buffer; + bytes_buffer += sizeof(S); - _n_nodes = (S)(bytes->size() / _s); + size_t roots_size = *(size_t*)bytes_buffer; + bytes_buffer += sizeof(size_t); + + _nodes_size = *(S*)bytes_buffer; + bytes_buffer += sizeof(S); _roots.clear(); - S m = -1; + _roots.resize(roots_size); + _roots.assign((S*) bytes_buffer, (S*) bytes_buffer + roots_size); + bytes_buffer += roots_size * sizeof(S); - for (S i = _n_nodes - 1; i >= 0; i--) { - S k = _get(i)->n_descendants; - if (m == -1 || k == m) { - _roots.push_back(i); - m = k; - } else { - break; - } - } + _allocate_size(_n_nodes * _s); - // hacky fix: since the last root precedes the copy of all roots, delete it - if (_roots.size() > 1 && _get(_roots.front())->children[0] == _get(_roots.back())->children[0]) - _roots.pop_back(); + memcpy(_nodes, bytes_buffer, _n_nodes * _s); _loaded = true; _built = true; - _n_items = m; - if (_verbose) annoylib_showUpdate("found %zu roots with degree %d\n", _roots.size(), m); + + if (_verbose) annoylib_showUpdate("found %zu roots with degree %d\n", _roots.size(), _n_items); return true; } From ab49363a719f57ada6966aacb575594ea54acc2f Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Sun, 28 Jan 2024 17:26:04 -0500 Subject: [PATCH 16/18] Added extra checks to serialize test --- test/serialize_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/serialize_test.py b/test/serialize_test.py index e1e8d1a6..3742d524 100644 --- a/test/serialize_test.py +++ b/test/serialize_test.py @@ -57,4 +57,7 @@ def test_serialize_after_load(): index2.load(save_path) assert index1.serialize() == index2.serialize() + assert index1.get_n_items() == index2.get_n_items() + assert index1.get_n_trees() == index2.get_n_trees() + assert index1.get_nns_by_item(0, index1.get_n_items()) == index2.get_nns_by_item(0, index1.get_n_items()) From 7d6118c686922538545cb7b776ce1d4e25294262 Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Mon, 29 Jan 2024 10:32:44 -0500 Subject: [PATCH 17/18] Made serialize method 'const' --- src/annoylib.h | 4 ++-- src/annoymodule.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/annoylib.h b/src/annoylib.h index 1c63d182..211717af 100644 --- a/src/annoylib.h +++ b/src/annoylib.h @@ -912,7 +912,7 @@ class AnnoyIndexInterface { virtual bool save(const char* filename, bool prefault=false, char** error=NULL) = 0; virtual void unload() = 0; virtual bool load(const char* filename, bool prefault=false, char** error=NULL) = 0; - virtual vector serialize(char** error=NULL) = 0; + virtual vector serialize(char** error=NULL) const = 0; virtual bool deserialize(vector* bytes, bool prefault=false, char** error=NULL) = 0; virtual T get_distance(S i, S j) const = 0; virtual void get_nns_by_item(S item, size_t n, int search_k, vector* result, vector* distances) const = 0; @@ -1223,7 +1223,7 @@ template serialize(char** error=NULL) { + vector serialize(char** error=NULL) const { if (!_built) { set_error_from_string(error, "Index cannot be serialized if it hasn't been built"); return {}; diff --git a/src/annoymodule.cc b/src/annoymodule.cc index b9850198..7705ed7b 100644 --- a/src/annoymodule.cc +++ b/src/annoymodule.cc @@ -97,7 +97,7 @@ class HammingWrapper : public AnnoyIndexInterface { bool save(const char* filename, bool prefault, char** error) { return _index.save(filename, prefault, error); }; void unload() { _index.unload(); }; bool load(const char* filename, bool prefault, char** error) { return _index.load(filename, prefault, error); }; - vector serialize(char** error) { return _index.serialize(error); }; + vector serialize(char** error) const { return _index.serialize(error); }; bool deserialize(vector* bytes, bool prefault, char** error) { return _index.deserialize(bytes, prefault, error); }; float get_distance(int32_t i, int32_t j) const { return _index.get_distance(i, j); }; void get_nns_by_item(int32_t item, size_t n, int search_k, vector* result, vector* distances) const { From 2d92ba9ccf405b3bb938522c884a4150d5ce7f79 Mon Sep 17 00:00:00 2001 From: FuexFollets Date: Tue, 6 Feb 2024 17:23:47 -0500 Subject: [PATCH 18/18] Fixed serialization and deserialization --- src/annoylib.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/annoylib.h b/src/annoylib.h index 211717af..a657a714 100644 --- a/src/annoylib.h +++ b/src/annoylib.h @@ -1245,7 +1245,7 @@ template