From 1f376d5fc1822ea370e339482813cb1fa0113d20 Mon Sep 17 00:00:00 2001 From: Yury Date: Sat, 31 Dec 2022 16:17:58 +0100 Subject: [PATCH] add tests; add onehotencoder support --- mlem/contrib/scipy.py | 12 ++++---- mlem/contrib/sklearn.py | 10 +++---- tests/contrib/test_scipy.py | 32 +++++++++++++++++++++ tests/contrib/test_sklearn.py | 53 +++++++++++++++++++++++------------ 4 files changed, 77 insertions(+), 30 deletions(-) create mode 100644 tests/contrib/test_scipy.py diff --git a/mlem/contrib/scipy.py b/mlem/contrib/scipy.py index d48b0c35..97a60f34 100644 --- a/mlem/contrib/scipy.py +++ b/mlem/contrib/scipy.py @@ -16,11 +16,11 @@ from mlem.core.requirements import InstallableRequirement, Requirements -class ScipySparceMatrix( +class ScipySparseMatrix( WithDefaultSerializer, DataType, DataHook, IsInstanceHookMixin ): type: ClassVar[str] = "csr_matrix" - valid_types: ClassVar = csr_matrix + valid_types: ClassVar = (csr_matrix,) dtype: str def get_requirements(self) -> Requirements: @@ -28,7 +28,7 @@ def get_requirements(self) -> Requirements: @classmethod def process(cls, obj: Any, **kwargs) -> DataType: - return ScipySparceMatrix(dtype=obj.dtype.name) + return ScipySparseMatrix(dtype=obj.dtype.name) def get_writer( self, project: str = None, filename: str = None, **kwargs @@ -36,12 +36,12 @@ def get_writer( return ScipyWriter(**kwargs) -class ScipyWriter(DataWriter[[ScipySparceMatrix]]): +class ScipyWriter(DataWriter[ScipySparseMatrix]): def write( self, data: DataType, storage: Storage, path: str - ) -> Tuple[DataReader[DataType], Artifacts]: + ) -> Tuple[DataReader, Artifacts]: with storage.open(path) as (f, art): - sparse.save_npz(f, art) + sparse.save_npz(f, data.data) return ScipyReader(data_type=data), {self.art_name: art} diff --git a/mlem/contrib/sklearn.py b/mlem/contrib/sklearn.py index af6928eb..3e3afc24 100644 --- a/mlem/contrib/sklearn.py +++ b/mlem/contrib/sklearn.py @@ -7,8 +7,9 @@ import sklearn from sklearn.base import ClassifierMixin, RegressorMixin -from sklearn.feature_extraction.text import TransformerMixin, _VectorizerMixin +from sklearn.feature_extraction.text import TransformerMixin from sklearn.pipeline import Pipeline +from sklearn.preprocessing._encoders import _BaseEncoder from mlem.constants import TRANSFORM_METHOD_NAME from mlem.core.hooks import IsInstanceHookMixin @@ -137,10 +138,7 @@ def process( class SklearnTransformer(SklearnModel): - valid_types: ClassVar = ( - TransformerMixin, - _VectorizerMixin, - ) + valid_types: ClassVar = (TransformerMixin, _BaseEncoder) type: ClassVar = "sklearn_transformer" @classmethod @@ -159,7 +157,7 @@ def process( TRANSFORM_METHOD_NAME: Signature.from_method( obj.transform, auto_infer=sample_data is not None, - raw_documents=sample_data, + X=sample_data, ), } diff --git a/tests/contrib/test_scipy.py b/tests/contrib/test_scipy.py new file mode 100644 index 00000000..86efed0e --- /dev/null +++ b/tests/contrib/test_scipy.py @@ -0,0 +1,32 @@ +import numpy as np +import pytest +from scipy.sparse import csr_matrix + +from mlem.contrib.scipy import ScipySparseMatrix +from mlem.core.data_type import DataAnalyzer +from tests.conftest import data_write_read_check + + +@pytest.fixture +def test_data(): + row = np.array([0, 0, 1, 2, 2, 2]) + col = np.array([0, 2, 2, 0, 1, 2]) + data = np.array([1, 2, 3, 4, 5, 6]) + return csr_matrix((data, (row, col)), shape=(3, 3), dtype="float32") + + +def test_sparce_matrix(test_data): + assert ScipySparseMatrix.is_object_valid(test_data) + sdt = DataAnalyzer.analyze(test_data) + assert sdt.dict() == {"dtype": "float32", "type": "csr_matrix"} + assert isinstance(sdt, ScipySparseMatrix) + assert sdt.dtype == "float32" + assert sdt.get_requirements().modules == ["scipy"] + + +def test_write_read(test_data): + sdt = DataAnalyzer.analyze(test_data) + sdt = sdt.bind(test_data) + data_write_read_check( + sdt, custom_eq=lambda x, y: np.array_equal(x.todense(), y.todense()) + ) diff --git a/tests/contrib/test_sklearn.py b/tests/contrib/test_sklearn.py index 62a295c1..4245b8d5 100644 --- a/tests/contrib/test_sklearn.py +++ b/tests/contrib/test_sklearn.py @@ -3,15 +3,17 @@ import lightgbm as lgb import numpy as np import pytest -from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.feature_extraction.text import TfidfTransformer from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import OneHotEncoder, StandardScaler from sklearn.svm import SVC from mlem.constants import PREDICT_METHOD_NAME, TRANSFORM_METHOD_NAME from mlem.contrib.numpy import NumpyNdarrayType -from mlem.contrib.scipy import ScipySparceMatrix + +# from mlem.contrib.scipy import ScipySparceMatrix +from mlem.contrib.scipy import ScipySparseMatrix from mlem.contrib.sklearn import SklearnModel, SklearnTransformer from mlem.core.artifacts import LOCAL_STORAGE from mlem.core.data_type import DataAnalyzer @@ -26,9 +28,9 @@ def inp_data(): return [[1, 2, 3], [3, 2, 1]] -@pytest.fixture -def inp_data_text(): - return ["Is that peanut butter on my nose? Mlem!"] +# @pytest.fixture +# def inp_data_text(): +# return ["Is that peanut butter on my nose? Mlem!"] @pytest.fixture @@ -51,12 +53,19 @@ def regressor(inp_data, out_data): @pytest.fixture -def transformer(inp_data_text): - tf_idf = TfidfVectorizer() - tf_idf.fit(inp_data_text) +def transformer(inp_data): + tf_idf = TfidfTransformer() + tf_idf.fit(inp_data) return tf_idf +@pytest.fixture +def onehotencoder(inp_data): + encoder = OneHotEncoder() + encoder.fit(inp_data) + return encoder + + @pytest.fixture() def pipeline(inp_data, out_data): pipe = Pipeline([("scaler", StandardScaler()), ("svc", SVC())]) @@ -90,24 +99,32 @@ def test_hook(model_fixture, inp_data, request): assert signature.returns == returns -def test_hook_transformer(transformer, inp_data_text): - data_type = DataAnalyzer.analyze(inp_data_text) - model_type = ModelAnalyzer.analyze(transformer, sample_data=inp_data_text) +@pytest.mark.parametrize( + "transformer_fixture", ["transformer", "onehotencoder"] +) +def test_hook_transformer(transformer_fixture, inp_data, request): + transformer = request.getfixturevalue(transformer_fixture) + data_type = DataAnalyzer.analyze(inp_data) + model_type = ModelAnalyzer.analyze(transformer, sample_data=inp_data) assert isinstance(model_type, SklearnTransformer) assert TRANSFORM_METHOD_NAME in model_type.methods signature = model_type.methods[TRANSFORM_METHOD_NAME] - returns = ScipySparceMatrix(dtype="float64") + returns = ScipySparseMatrix(dtype="float64") assert signature.name == TRANSFORM_METHOD_NAME - assert signature.args[0] == Argument(name="raw_documents", type_=data_type) + assert signature.args[0] == Argument(name="X", type_=data_type) assert signature.returns == returns -def test_model_type__transform(transformer, inp_data_text): - model_type = ModelAnalyzer.analyze(transformer, sample_data=inp_data_text) +@pytest.mark.parametrize( + "transformer_fixture", ["transformer", "onehotencoder"] +) +def test_model_type__transform(transformer_fixture, inp_data, request): + transformer = request.getfixturevalue(transformer_fixture) + model_type = ModelAnalyzer.analyze(transformer, sample_data=inp_data) np.testing.assert_array_almost_equal( - transformer.transform(inp_data_text).todense(), - model_type.call_method("transform", inp_data_text).todense(), + transformer.transform(inp_data).todense(), + model_type.call_method("transform", inp_data).todense(), )