diff --git a/mlem/api/commands.py b/mlem/api/commands.py
index 68b3f3c9..ba093805 100644
--- a/mlem/api/commands.py
+++ b/mlem/api/commands.py
@@ -32,6 +32,7 @@
     MlemLink,
     MlemModel,
     MlemObject,
+    _ModelMethodCall,
 )
 from mlem.runtime.client import Client
 from mlem.runtime.interface import ModelInterface
@@ -85,18 +86,20 @@ def apply(
     except WrongMethodError:
         resolved_method = PREDICT_METHOD_NAME
     echo(EMOJI_APPLY + f"Applying `{resolved_method}` method...")
+    method_call = _ModelMethodCall(
+        name=resolved_method,
+        order=model.call_orders[resolved_method],
+        model=model,
+    )
     if batch_size:
         res: Any = []
         for part in data:
             batch_data = get_data_value(part, batch_size)
             for batch in batch_data:
-                preds = w.call_method(resolved_method, batch.data)
+                preds = method_call(batch.data)
                 res += [*preds]  # TODO: merge results
     else:
-        res = [
-            w.call_method(resolved_method, get_data_value(part))
-            for part in data
-        ]
+        res = [method_call(get_data_value(part)) for part in data]
     if output is None:
         if len(res) == 1:
             return res[0]
diff --git a/mlem/constants.py b/mlem/constants.py
index 26f3f516..6914825e 100644
--- a/mlem/constants.py
+++ b/mlem/constants.py
@@ -4,5 +4,6 @@
 PREDICT_METHOD_NAME = "predict"
 PREDICT_PROBA_METHOD_NAME = "predict_proba"
 PREDICT_ARG_NAME = "data"
+TRANSFORM_METHOD_NAME = "transform"
 
 MLEM_CONFIG_FILE_NAME = ".mlem.yaml"
diff --git a/mlem/contrib/numpy.py b/mlem/contrib/numpy.py
index 5861b11f..74c96976 100644
--- a/mlem/contrib/numpy.py
+++ b/mlem/contrib/numpy.py
@@ -44,6 +44,23 @@ def np_type_from_string(string_repr) -> np.dtype:
         raise ValueError(f"Unknown numpy type {string_repr}") from e
 
 
+def check_shape(shape, array, exc_type):
+    if shape is not None:
+        if len(array.shape) != len(shape):
+            raise exc_type(
+                f"given array is of rank: {len(array.shape)}, expected: {len(shape)}"
+            )
+
+        array_shape = tuple(
+            None if expected_dim is None else array_dim
+            for array_dim, expected_dim in zip(array.shape, shape)
+        )
+        if tuple(array_shape) != shape:
+            raise exc_type(
+                f"given array is of shape: {array_shape}, expected: {shape}"
+            )
+
+
 class NumpyNumberType(
     WithDefaultSerializer, LibRequirementsMixin, DataType, DataHook
 ):
@@ -123,22 +140,6 @@ def subtype(self, subshape: Tuple[Optional[int], ...]):
             max_items=subshape[0],
         )
 
-    def check_shape(self, array, exc_type):
-        if self.shape is not None:
-            if len(array.shape) != len(self.shape):
-                raise exc_type(
-                    f"given array is of rank: {len(array.shape)}, expected: {len(self.shape)}"
-                )
-
-            array_shape = tuple(
-                None if expected_dim is None else array_dim
-                for array_dim, expected_dim in zip(array.shape, self.shape)
-            )
-            if tuple(array_shape) != self.shape:
-                raise exc_type(
-                    f"given array is of shape: {array_shape}, expected: {self.shape}"
-                )
-
     def get_writer(self, project: str = None, filename: str = None, **kwargs):
         return NumpyArrayWriter()
 
@@ -171,7 +172,7 @@
                 f"given object: {obj} could not be converted to array "
                 f"of type: {np_type_from_string(data_type.dtype)}"
             ) from e
-        data_type.check_shape(ret, DeserializationError)
+        check_shape(data_type.shape, ret, DeserializationError)
         return ret
 
     def serialize(self, data_type, instance: np.ndarray):
@@ -181,7 +182,7 @@
             raise SerializationError(
                 f"given array is of type: {instance.dtype}, expected: {exp_type}"
            )
-        data_type.check_shape(instance, SerializationError)
+        check_shape(data_type.shape, instance, SerializationError)
         return instance.tolist()
 
 
diff --git a/mlem/contrib/scipy.py b/mlem/contrib/scipy.py
new file mode 100644
index 00000000..75c95336
--- /dev/null
+++ b/mlem/contrib/scipy.py
@@ -0,0 +1,149 @@
+"""Scipy Sparse matrices support
+Extension type: data
+
+DataType, Reader and Writer implementations for `scipy.sparse`
+"""
+from typing import ClassVar, Iterator, List, Optional, Tuple, Type, Union
+
+import scipy
+from pydantic import BaseModel
+from pydantic.main import create_model
+from pydantic.types import conlist
+from scipy import sparse
+from scipy.sparse import spmatrix
+
+from mlem.contrib.numpy import (
+    check_shape,
+    np_type_from_string,
+    python_type_from_np_string_repr,
+)
+from mlem.core.artifacts import Artifacts, Storage
+from mlem.core.data_type import (
+    DataHook,
+    DataReader,
+    DataSerializer,
+    DataType,
+    DataWriter,
+    WithDefaultSerializer,
+)
+from mlem.core.errors import DeserializationError, SerializationError
+from mlem.core.hooks import IsInstanceHookMixin
+from mlem.core.requirements import InstallableRequirement, Requirements
+
+
+class ScipySparseMatrix(
+    WithDefaultSerializer, DataType, DataHook, IsInstanceHookMixin
+):
+    """
+    DataType implementation for scipy sparse matrix
+    """
+
+    type: ClassVar[str] = "csr_matrix"
+    valid_types: ClassVar = (spmatrix,)
+    shape: Optional[Tuple]
+    """Shape of `sparse.csr_matrix` object in data"""
+    dtype: str
+    """Dtype of `sparse.csr_matrix` object in data"""
+
+    def get_requirements(self) -> Requirements:
+        return Requirements.new([InstallableRequirement.from_module(scipy)])
+
+    @classmethod
+    def process(cls, obj: sparse.csr_matrix, **kwargs) -> DataType:
+        return ScipySparseMatrix(dtype=obj.dtype.name, shape=obj.shape)
+
+    def get_writer(
+        self, project: str = None, filename: str = None, **kwargs
+    ) -> DataWriter:
+        return ScipyWriter(**kwargs)
+
+    def subtype(self, subshape: Tuple[Optional[int], ...]):
+        if len(subshape) == 0:
+            return python_type_from_np_string_repr(self.dtype)
+        return conlist(
+            self.subtype(subshape[1:]),
+            min_items=subshape[0],
+            max_items=subshape[0],
+        )
+
+
+class ScipyWriter(DataWriter[ScipySparseMatrix]):
+    """
+    Write scipy matrix to npz format
+    """
+
+    type: ClassVar[str] = "csr_matrix"
+
+    def write(
+        self, data: DataType, storage: Storage, path: str
+    ) -> Tuple[DataReader, Artifacts]:
+        with storage.open(path) as (f, art):
+            sparse.save_npz(f, data.data)
+        return ScipyReader(data_type=data), {self.art_name: art}
+
+
+class ScipyReader(DataReader):
+    """
+    Read scipy matrix from npz format
+    """
+
+    type: ClassVar[str] = "csr_matrix"
+
+    def read_batch(
+        self, artifacts: Artifacts, batch_size: int
+    ) -> Iterator[DataType]:
+        raise NotImplementedError
+
+    def read(self, artifacts: Artifacts) -> DataType:
+        if DataWriter.art_name not in artifacts:
+            raise ValueError(
+                f"Wrong artifacts {artifacts}: should be one {DataWriter.art_name} file"
+            )
+        with artifacts[DataWriter.art_name].open() as f:
+            data = sparse.load_npz(f)
+        return self.data_type.copy().bind(data)
+
+
+class ScipySparseMatrixSerializer(DataSerializer[ScipySparseMatrix]):
+    """
+    Serializer for scipy sparse matrices
+    """
+
+    is_default: ClassVar = True
+    data_class: ClassVar = ScipySparseMatrix
+
+    def get_model(
+        self, data_type: ScipySparseMatrix, prefix: str = ""
+    ) -> Union[Type[BaseModel], type]:
+        item_type = List[data_type.subtype(data_type.shape[1:])]  # type: ignore[index]
+        return create_model(
+            prefix + "ScipySparse",
+            __root__=(item_type, ...),
+        )
+
+    def serialize(self, data_type: ScipySparseMatrix, instance: spmatrix):
+        data_type.check_type(instance, sparse.csr_matrix, SerializationError)
+        if instance.dtype != np_type_from_string(data_type.dtype):
+            raise SerializationError(
+                f"given matrix is of dtype: {instance.dtype}, "
+                f"expected: {data_type.dtype}"
+            )
+        check_shape(data_type.shape, instance, SerializationError)
+        coordinate_matrix = instance.tocoo()
+        data = coordinate_matrix.data
+        row = coordinate_matrix.row
+        col = coordinate_matrix.col
+        return data, (row, col)
+
+    def deserialize(self, data_type, obj) -> sparse.csr_matrix:
+        try:
+            mat = sparse.csr_matrix(
+                obj, dtype=data_type.dtype, shape=data_type.shape
+            )
+        except ValueError as e:
+            raise DeserializationError(
+                f"Given object {obj} could not be converted "
+                f"to sparse matrix of type: {data_type.type}"
+            ) from e
+        check_shape(data_type.shape, mat, DeserializationError)
+        return mat
diff --git a/mlem/contrib/sklearn.py b/mlem/contrib/sklearn.py
index 86aea130..99a81102 100644
--- a/mlem/contrib/sklearn.py
+++ b/mlem/contrib/sklearn.py
@@ -7,8 +7,11 @@
 import sklearn
 from sklearn.base import ClassifierMixin, RegressorMixin
+from sklearn.base import TransformerMixin
 from sklearn.pipeline import Pipeline
+from sklearn.preprocessing._encoders import _BaseEncoder
 
+from mlem.constants import TRANSFORM_METHOD_NAME
 from mlem.core.hooks import IsInstanceHookMixin
 from mlem.core.model import (
     ModelHook,
@@ -132,3 +135,36 @@
             **predict_proba_args
         )
         return mt
+
+
+class SklearnTransformer(SklearnModel):
+    """
+    Model Type implementation for sklearn transformers
+    """
+
+    valid_types: ClassVar = (TransformerMixin, _BaseEncoder)
+    type: ClassVar = "sklearn_transformer"
+
+    @classmethod
+    def process(
+        cls,
+        obj: Any,
+        sample_data: Optional[Any] = None,
+        methods_sample_data: Optional[Dict[str, Any]] = None,
+        **kwargs
+    ) -> ModelType:
+        methods_sample_data = methods_sample_data or {}
+        sample_data = methods_sample_data.get(
+            TRANSFORM_METHOD_NAME, sample_data
+        )
+        methods = {
+            TRANSFORM_METHOD_NAME: Signature.from_method(
+                obj.transform,
+                auto_infer=sample_data is not None,
+                X=sample_data,
+            ),
+        }
+
+        return SklearnTransformer(io=SimplePickleIO(), methods=methods).bind(
+            obj
+        )
diff --git a/mlem/ext.py b/mlem/ext.py
index e2f3ff34..59c08080 100644
--- a/mlem/ext.py
+++ b/mlem/ext.py
@@ -129,6 +129,7 @@
             False,
         ),
         Extension("mlem.contrib.git", ["pygit2"], True),
+        Extension("mlem.contrib.scipy", ["scipy"], False),
         Extension(
             "mlem.contrib.flyio", ["docker", "fastapi", "uvicorn"], False
         ),
diff --git a/setup.py b/setup.py
index a69acb1e..f3b3754f 100644
--- a/setup.py
+++ b/setup.py
@@ -64,6 +64,7 @@
     "pandas": ["pandas"],
     "numpy": ["numpy"],
     "sklearn": ["scikit-learn"],
+    "scipy": ["scipy"],
     "onnx": ["onnx"],
     "onnxruntime": [
         "protobuf==3.20.1",
@@ -246,6 +247,11 @@
         "serializer.xgboost_dmatrix = mlem.contrib.xgboost:DMatrixSerializer",
         "model_type.xgboost = mlem.contrib.xgboost:XGBoostModel",
         "model_io.xgboost_io = mlem.contrib.xgboost:XGBoostModelIO",
+        "data_type.csr_matrix = mlem.contrib.scipy:ScipySparseMatrix",
+        "data_writer.csr_matrix = mlem.contrib.scipy:ScipyWriter",
+        "data_reader.csr_matrix = mlem.contrib.scipy:ScipyReader",
+        "model_type.sklearn_transformer = mlem.contrib.sklearn:SklearnTransformer",
+        "serializer.csr_matrix = mlem.contrib.scipy:ScipySparseMatrixSerializer",
     ],
     "mlem.config": [
         "core = mlem.config:MlemConfig",
diff --git a/tests/contrib/test_scipy.py b/tests/contrib/test_scipy.py
new file mode 100644
index 00000000..436a511a
--- /dev/null
+++ b/tests/contrib/test_scipy.py
@@ -0,0 +1,101 @@
+import numpy as np
+import pytest
+from scipy.sparse import csr_matrix
+
+from mlem.contrib.scipy import ScipySparseMatrix
+from mlem.core.data_type import DataAnalyzer
+from mlem.core.errors import DeserializationError, SerializationError
+from tests.conftest import data_write_read_check
+
+
+@pytest.fixture
+def raw_data():
+    row = np.array([0, 0, 1, 2, 2, 2])
+    col = np.array([0, 2, 2, 0, 1, 2])
+    data = np.array([1, 2, 3, 4, 5, 6])
+    return data, (row, col)
+
+
+@pytest.fixture
+def sparse_mat(raw_data):
+    return csr_matrix(raw_data, shape=(3, 3), dtype="float32")
+
+
+@pytest.fixture
+def schema():
+    return {
+        "title": "ScipySparse",
+        "type": "array",
+        "items": {
+            "type": "array",
+            "items": {"type": "number"},
+            "minItems": 3,
+            "maxItems": 3,
+        },
+    }
+
+
+@pytest.fixture
+def sparse_data_type(sparse_mat):
+    return DataAnalyzer.analyze(sparse_mat)
+
+
+def test_sparse_matrix(sparse_mat, schema):
+    assert ScipySparseMatrix.is_object_valid(sparse_mat)
+    sdt = DataAnalyzer.analyze(sparse_mat)
+    assert sdt.dict() == {
+        "dtype": "float32",
+        "type": "csr_matrix",
+        "shape": (3, 3),
+    }
+    model = sdt.get_model()
+    assert model.__name__ == "ScipySparse"
+    assert model.schema() == schema
+    assert isinstance(sdt, ScipySparseMatrix)
+    assert sdt.dtype == "float32"
+    assert sdt.get_requirements().modules == ["scipy"]
+
+
+def test_serialization(sparse_mat):
+    sdt = DataAnalyzer.analyze(sparse_mat)
+    payload = sdt.serialize(sparse_mat)
+    deserialized_data = sdt.deserialize(payload)
+    assert np.array_equal(sparse_mat.todense(), deserialized_data.todense())
+
+
+def test_write_read(sparse_mat):
+    sdt = DataAnalyzer.analyze(sparse_mat)
+    sdt = sdt.bind(sparse_mat)
+    data_write_read_check(
+        sdt, custom_eq=lambda x, y: np.array_equal(x.todense(), y.todense())
+    )
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        1,  # wrong type
+        csr_matrix(
+            ([1], ([1], [0])), shape=(3, 3), dtype="float64"
+        ),  # wrong dtype
+        csr_matrix(
+            ([1], ([1], [0])), shape=(2, 2), dtype="float32"
+        ),  # wrong shape
+    ],
+)
+def test_serialize_failure(sparse_mat, obj):
+    sdt = DataAnalyzer.analyze(sparse_mat)
+    with pytest.raises(SerializationError):
+        sdt.serialize(obj)
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        1,  # wrong type
+        ([1, 1], ([0, 6], [1, 6])),  # wrong shape
+    ],
+)
+def test_deserialize_failure(sparse_data_type, obj):
+    with pytest.raises(DeserializationError):
+        sparse_data_type.deserialize(obj)
diff --git a/tests/contrib/test_sklearn.py b/tests/contrib/test_sklearn.py
index da456667..a7105108 100644
--- a/tests/contrib/test_sklearn.py
+++ b/tests/contrib/test_sklearn.py
@@ -3,14 +3,17 @@
 import lightgbm as lgb
 import numpy as np
 import pytest
+from sklearn.feature_extraction.text import TfidfTransformer
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
 from sklearn.svm import SVC
 
-from mlem.constants import PREDICT_METHOD_NAME
+from mlem.api import apply, load_meta, save
+from mlem.constants import PREDICT_METHOD_NAME, TRANSFORM_METHOD_NAME
 from mlem.contrib.numpy import NumpyNdarrayType
-from mlem.contrib.sklearn import SklearnModel
+from mlem.contrib.scipy import ScipySparseMatrix
+from mlem.contrib.sklearn import SklearnModel, SklearnTransformer
 from mlem.core.artifacts import LOCAL_STORAGE
 from mlem.core.data_type import DataAnalyzer
 from mlem.core.model import Argument, ModelAnalyzer
@@ -43,6 +46,20 @@ def regressor(inp_data, out_data):
     return lr
 
 
+@pytest.fixture
+def transformer(inp_data):
+    tf_idf = TfidfTransformer()
+    tf_idf.fit(inp_data)
+    return tf_idf
+
+
+@pytest.fixture
+def onehotencoder(inp_data):
+    encoder = OneHotEncoder()
+    encoder.fit(inp_data)
+    return encoder
+
+
 @pytest.fixture()
 def pipeline(inp_data, out_data):
     pipe = Pipeline([("scaler", StandardScaler()), ("svc", SVC())])
@@ -76,6 +93,59 @@ def test_hook(model_fixture, inp_data, request):
     assert signature.returns == returns
 
 
+@pytest.mark.parametrize(
+    "transformer_fixture", ["transformer", "onehotencoder"]
+)
+def test_hook_transformer(transformer_fixture, inp_data, request):
+    transformer = request.getfixturevalue(transformer_fixture)
+    data_type = DataAnalyzer.analyze(inp_data)
+    model_type = ModelAnalyzer.analyze(transformer, sample_data=inp_data)
+    assert isinstance(model_type, SklearnTransformer)
+    assert TRANSFORM_METHOD_NAME in model_type.methods
+    signature = model_type.methods[TRANSFORM_METHOD_NAME]
+    cols = len(transformer.get_feature_names_out())
+    rows = len(inp_data)
+    returns = ScipySparseMatrix(dtype="float64", shape=(rows, cols))
+    assert signature.name == TRANSFORM_METHOD_NAME
+    assert signature.args[0] == Argument(name="X", type_=data_type)
+    assert signature.returns == returns
+
+
+@pytest.mark.parametrize(
+    "transformer_fixture", ["transformer", "onehotencoder"]
+)
+def test_model_type__transform(transformer_fixture, inp_data, request):
+    transformer = request.getfixturevalue(transformer_fixture)
+    model_type = ModelAnalyzer.analyze(transformer, sample_data=inp_data)
+
+    np.testing.assert_array_almost_equal(
+        transformer.transform(inp_data).todense(),
+        model_type.call_method("transform", inp_data).todense(),
+    )
+
+
+@pytest.mark.parametrize(
+    "transformer_fixture", ["transformer", "onehotencoder"]
+)
+def test_preprocess_transformer(
+    transformer_fixture, inp_data, tmpdir, out_data, request
+):
+    transformer = request.getfixturevalue(transformer_fixture)
+    model_file = "clf"
+    clf = LogisticRegression()
+    train_data = transformer.transform(inp_data)
+    clf.fit(train_data, out_data)
+    save(
+        clf,
+        str(tmpdir / model_file),
+        sample_data=inp_data,
+        preprocess=transformer,
+    )
+    clf = load_meta(str(tmpdir / model_file))
+    output = apply(clf, inp_data)
+    assert np.array_equal(output, out_data)
+
+
 def test_hook_lgb(lgbm_model, inp_data):
     data_type = DataAnalyzer.analyze(inp_data)
     model_type = ModelAnalyzer.analyze(lgbm_model, sample_data=inp_data)
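
For reviewers, a minimal usage sketch distilled from `test_preprocess_transformer` above (the sample data and the `"clf"` save path are illustrative, not part of the patch): a fitted transformer passed to `save(..., preprocess=...)` is captured by the new `SklearnTransformer` model type, and `apply` then routes raw inputs through `transform`, whose sparse output is handled by the new `ScipySparseMatrix` data type, before `predict`.

    import numpy as np
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.linear_model import LogisticRegression

    from mlem.api import apply, load_meta, save

    inp_data = [[1, 2, 3], [3, 2, 1]]  # illustrative sample data
    out_data = [0, 1]

    # Fit the preprocessing step and train the classifier on its output.
    tf_idf = TfidfTransformer().fit(inp_data)
    clf = LogisticRegression().fit(tf_idf.transform(inp_data), out_data)

    # The transformer is saved alongside the model; apply() then runs
    # `transform` (yielding a csr_matrix) before `predict` on raw inputs.
    save(clf, "clf", sample_data=inp_data, preprocess=tf_idf)
    preds = apply(load_meta("clf"), inp_data)
    assert np.array_equal(preds, out_data)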