Skip to content

Commit

Permalink
Merge pull request #125 from monarch-initiative/prep_for_v0_2_3
Browse files (browse the repository at this point in the history)
Prep for v0.2.3 release
  • Loading branch information
caufieldjh authored Jan 16, 2025
2 parents 966f1af + 38d44d1 commit 3763ae9
Show file tree
Hide file tree
Showing 7 changed files with 1,493 additions and 1,555 deletions.
2,826 changes: 1,382 additions & 1,444 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "curategpt"
version = "0.2.2"
version = "0.2.3"
description = "CurateGPT"
authors = ["Chris Mungall <[email protected]>", "Carlo Kroll <[email protected]>", "Harshad Hegde <[email protected]>", "J. Harry Caufield <[email protected]>"]
license = "BSD-3"
Expand Down
9 changes: 4 additions & 5 deletions src/curategpt/store/db_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,9 +214,7 @@ def collection_metadata(
:return:
"""

def set_collection_metadata(
self, collection_name: Optional[str], metadata: Metadata, **kwargs
):
def set_collection_metadata(self, collection_name: Optional[str], metadata: Metadata, **kwargs):
"""
Set the metadata for a collection.
Expand Down Expand Up @@ -488,6 +486,7 @@ def dump_then_load(self, collection: str = None, target: "DBAdapter" = None):
"""
raise NotImplementedError

def insert_from_huggingface(self, objs: Union[OBJECT, Iterable[OBJECT]], collection: str = None, **kwargs):
def insert_from_huggingface(
self, objs: Union[OBJECT, Iterable[OBJECT]], collection: str = None, **kwargs
):
raise NotImplementedError

140 changes: 76 additions & 64 deletions src/curategpt/store/duckdb_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,9 @@ def _process_objects(

if collection not in self.list_collection_names():
self._create_table_if_not_exists(
collection, self.vec_dimension, venomx=updated_venomx,
collection,
self.vec_dimension,
venomx=updated_venomx,
)

# if collection already exists, update metadata here
Expand Down Expand Up @@ -409,17 +411,17 @@ def _process_objects(
self.create_index(collection)

def insert_from_huggingface(
self,
objs: Union[OBJECT, Iterable[OBJECT]],
collection: str = None,
batch_size: int = None,
text_field: Union[str, Callable] = None,
venomx: Optional[Metadata] = None,
object_type: Optional[str] = None,
distance: Optional[str] = None,
vec_dimension: Optional[int] = None,
method: str = "insert",
**kwargs,
self,
objs: Union[OBJECT, Iterable[OBJECT]],
collection: str = None,
batch_size: int = None,
text_field: Union[str, Callable] = None,
venomx: Optional[Metadata] = None,
object_type: Optional[str] = None,
distance: Optional[str] = None,
vec_dimension: Optional[int] = None,
method: str = "insert",
**kwargs,
):
collection = self._get_collection(collection)
model = None
Expand All @@ -434,7 +436,9 @@ def insert_from_huggingface(
vec_dimension = self._get_embedding_dimension(model)

except Exception as e:
raise KeyError(f"Metadata from {collection} is not compatible with the current version of CurateGPT") from e
raise KeyError(
f"Metadata from {collection} is not compatible with the current version of CurateGPT"
) from e

updated_venomx = self.update_or_create_venomx(
venomx.venomx,
Expand All @@ -446,9 +450,11 @@ def insert_from_huggingface(
)
if collection not in self.list_collection_names():
self._create_table_if_not_exists(
collection, vec_dimension, venomx=updated_venomx,
collection,
vec_dimension,
venomx=updated_venomx,
)
updated_venomx.venomx.id = collection # prevent name error
updated_venomx.venomx.id = collection # prevent name error
self.set_collection_metadata(collection_name=collection, metadata=updated_venomx)
if batch_size is None:
batch_size = 100000
Expand All @@ -464,11 +470,17 @@ def insert_from_huggingface(

for next_objs in chunk(objs, batch_size):
next_objs = list(next_objs)
ids = [item['metadata']['id'] for item in next_objs]
ids = [item["metadata"]["id"] for item in next_objs]
metadatas = [self._dict(o) for o in next_objs]
documents = [item['document'] for item in next_objs]
embeddings = [item['embeddings'].tolist() if isinstance(item['embeddings'], np.ndarray)
else item['embeddings'] for item in next_objs]
documents = [item["document"] for item in next_objs]
embeddings = [
(
item["embeddings"].tolist()
if isinstance(item["embeddings"], np.ndarray)
else item["embeddings"]
)
for item in next_objs
]
try:
self.conn.execute("BEGIN TRANSACTION;")
self.conn.executemany(
Expand All @@ -484,67 +496,65 @@ def insert_from_huggingface(
finally:
self.create_index(collection)



def update_or_create_venomx(
self,
venomx: Optional[Index],
collection: str,
model: str,
distance: str,
object_type: str,
embeddings_dimension: Optional[int],
self,
venomx: Optional[Index],
collection: str,
model: str,
distance: str,
object_type: str,
embeddings_dimension: Optional[int],
) -> Metadata:
"""
Updates an existing Index instance (venomx) with additional values or creates a new one if none is provided.
"""
# If venomx already exists, update its nested fields (as e.g. vec_dimension would not be given)
if venomx:
new_embedding_model = Model(name=model)
updated_index = venomx.model_copy(update={ # given venomx comes as venomx=Index()
"embedding_model": new_embedding_model,
"embeddings_dimensions": embeddings_dimension,
})

venomx = Metadata(
venomx=updated_index,
hnsw_space=distance,
object_type=object_type
updated_index = venomx.model_copy(
update={ # given venomx comes as venomx=Index()
"embedding_model": new_embedding_model,
"embeddings_dimensions": embeddings_dimension,
}
)

venomx = Metadata(venomx=updated_index, hnsw_space=distance, object_type=object_type)

else:
if distance is None:
distance = self.distance_metric
venomx = self.populate_venomx(collection, model, distance, object_type, embeddings_dimension)
venomx = self.populate_venomx(
collection, model, distance, object_type, embeddings_dimension
)

return venomx

@staticmethod
def populate_venomx(
collection: Optional[str],
model: Optional[str],
distance: str,
object_type: str,
embeddings_dimension: int,
collection: Optional[str],
model: Optional[str],
distance: str,
object_type: str,
embeddings_dimension: int,
) -> Metadata:
"""
Populate venomx with data currently given when inserting
Populate venomx with data currently given when inserting
:param collection:
:param model:
:param distance:
:param object_type:
:param embeddings_dimension:
:return:
"""
:param collection:
:param model:
:param distance:
:param object_type:
:param embeddings_dimension:
:return:
"""
venomx = Metadata(
venomx=Index(
id=collection,
embedding_model=Model(name=model),
embeddings_dimensions=embeddings_dimension,
),
hnsw_space=distance,
object_type=object_type
object_type=object_type,
)
return venomx

Expand Down Expand Up @@ -764,15 +774,17 @@ def update_collection_metadata(self, collection: str, **kwargs):
raise ValueError("Collection name must be provided.")
metadata = self.collection_metadata(collection)
current_venomx = {**kwargs}
if metadata is None: # should not be possible
logger.warning(f"No existing metadata found for collection {collection}. Initializing new metadata.")
if metadata is None: # should not be possible
logger.warning(
f"No existing metadata found for collection {collection}. Initializing new metadata."
)
metadata = Metadata(venomx=Index(**current_venomx))
else:
metadata_dict = metadata.model_dump(exclude_none=True)
# Check if the existing venomx has an embedding model and if it matches the one in kwargs
if 'venomx' in metadata_dict and metadata_dict['venomx'].get('embedding_model'):
existing_model_name = metadata_dict['venomx']['embedding_model'].get('name')
new_model_name = current_venomx.get('embedding_model', {}).get('name')
if "venomx" in metadata_dict and metadata_dict["venomx"].get("embedding_model"):
existing_model_name = metadata_dict["venomx"]["embedding_model"].get("name")
new_model_name = current_venomx.get("embedding_model", {}).get("name")

if new_model_name and existing_model_name and new_model_name != existing_model_name:
raise ValueError(
Expand All @@ -781,10 +793,10 @@ def update_collection_metadata(self, collection: str, **kwargs):
)

# Merge current_venomx (from kwargs) into the nested venomx dictionary
if 'venomx' in metadata_dict and isinstance(metadata_dict['venomx'], dict):
metadata_dict['venomx'].update(current_venomx)
if "venomx" in metadata_dict and isinstance(metadata_dict["venomx"], dict):
metadata_dict["venomx"].update(current_venomx)
else:
metadata_dict['venomx'] = current_venomx
metadata_dict["venomx"] = current_venomx
# Reconstruct the Metadata object from the updated dictionary
metadata = Metadata(**metadata_dict)
updated_metadata_dict = metadata.model_dump(exclude_none=True)
Expand All @@ -799,9 +811,7 @@ def update_collection_metadata(self, collection: str, **kwargs):
)
return metadata

def set_collection_metadata(
self, collection_name: Optional[str], metadata: Metadata, **kwargs
):
def set_collection_metadata(self, collection_name: Optional[str], metadata: Metadata, **kwargs):
"""
Set the metadata for the collection
:param collection_name:
Expand All @@ -816,7 +826,9 @@ def set_collection_metadata(

if metadata:
if metadata.venomx.id != collection_name:
raise ValueError(f"venomx.id: {metadata.venomx.id} must match collection_name {collection_name}")
raise ValueError(
f"venomx.id: {metadata.venomx.id} must match collection_name {collection_name}"
)

new_model = metadata.venomx.embedding_model.name

Expand Down
5 changes: 3 additions & 2 deletions src/curategpt/store/duckdb_connection_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def _kill_process(pid: int) -> None:
"""Kill a process if it's holding the database lock."""
try:
import psutil

if psutil.pid_exists(pid):
process = psutil.Process(pid)
process.terminate()
Expand Down Expand Up @@ -65,11 +66,11 @@ def connect(self) -> duckdb.DuckDBPyConnection:
- Now safely open the fixed database normally
"""
wal_path = Path(self.path + '.wal')
wal_path = Path(self.path + ".wal")
if wal_path.exists():
logger.info("Found WAL file, attempting recovery...")
try:
temp_conn = duckdb.connect(':memory:')
temp_conn = duckdb.connect(":memory:")
self._load_vss_extensions(temp_conn)
temp_conn.execute(f"ATTACH '{self.path}' AS main_db")
temp_conn.execute("CHECKPOINT;")
Expand Down
47 changes: 21 additions & 26 deletions src/curategpt/store/in_memory_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,8 @@ def insert(self, objs: Union[OBJECT, Iterable[OBJECT]], collection: str = None,
"""
self._insert(objs, collection, **kwargs)


def _insert(
self,
objs: Union[OBJECT, Iterable[OBJECT]],
collection: str = None,
venomx: Metadata = None
self, objs: Union[OBJECT, Iterable[OBJECT]], collection: str = None, venomx: Metadata = None
):
collection_obj = self._get_collection_object(collection)
if venomx is None:
Expand All @@ -121,33 +117,35 @@ def _insert(

@staticmethod
def populate_venomx(
collection: Optional[str],
model: Optional[str] = None,
distance: str = None,
object_type: str = None,
embeddings_dimension: int = None,
index_fields: Optional[Union[List[str], Tuple[str]]] = None,
collection: Optional[str],
model: Optional[str] = None,
distance: str = None,
object_type: str = None,
embeddings_dimension: int = None,
index_fields: Optional[Union[List[str], Tuple[str]]] = None,
) -> Metadata:
"""
Populate venomx with data currently given when inserting
Populate venomx with data currently given when inserting
:param collection:
:param model:
:param distance:
:param object_type:
:param embeddings_dimension:
:param index_fields:
:return:
"""
:param collection:
:param model:
:param distance:
:param object_type:
:param embeddings_dimension:
:param index_fields:
:return:
"""
venomx = Metadata(
venomx=Index(
id=collection,
embedding_model=Model(name=model),
embeddings_dimensions=embeddings_dimension,
embedding_input_method=ModelInputMethod(fields=index_fields) if index_fields else None
embedding_input_method=(
ModelInputMethod(fields=index_fields) if index_fields else None
),
),
hnsw_space=distance,
object_type=object_type
object_type=object_type,
)
return venomx

Expand Down Expand Up @@ -200,9 +198,7 @@ def collection_metadata(
cm.object_count = len(collection_obj.objects)
return cm

def set_collection_metadata(
self, collection_name: Optional[str], metadata: Metadata, **kwargs
):
def set_collection_metadata(self, collection_name: Optional[str], metadata: Metadata, **kwargs):
"""
Set the metadata for a collection.
Expand All @@ -215,7 +211,6 @@ def set_collection_metadata(
# raise ValueError(f"venomx.id: {metadata.venomx.id} must match collection_name {collection_name} and should not be changed")
collection_obj.metadata = metadata.model_dump(exclude_none=True)


def update_collection_metadata(self, collection_name: str, **kwargs) -> Metadata:
"""
Update the metadata for a collection.
Expand Down
Loading

0 comments on commit 3763ae9

Please sign in to comment.