Skip to content
This repository has been archived by the owner on Sep 13, 2023. It is now read-only.

Commit

Permalink
Add tests; add OneHotEncoder support
Browse files Browse the repository at this point in the history
  • Loading branch information
Yury committed Dec 31, 2022
1 parent 424ee8e commit 1f376d5
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 30 deletions.
12 changes: 6 additions & 6 deletions mlem/contrib/scipy.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,32 +16,32 @@
from mlem.core.requirements import InstallableRequirement, Requirements


class ScipySparceMatrix(
class ScipySparseMatrix(
WithDefaultSerializer, DataType, DataHook, IsInstanceHookMixin
):
type: ClassVar[str] = "csr_matrix"
valid_types: ClassVar = csr_matrix
valid_types: ClassVar = (csr_matrix,)
dtype: str

def get_requirements(self) -> Requirements:
return Requirements.new([InstallableRequirement.from_module(scipy)])

@classmethod
def process(cls, obj: Any, **kwargs) -> DataType:
return ScipySparceMatrix(dtype=obj.dtype.name)
return ScipySparseMatrix(dtype=obj.dtype.name)

def get_writer(
self, project: str = None, filename: str = None, **kwargs
) -> DataWriter:
return ScipyWriter(**kwargs)


class ScipyWriter(DataWriter[[ScipySparceMatrix]]):
class ScipyWriter(DataWriter[ScipySparseMatrix]):
def write(
self, data: DataType, storage: Storage, path: str
) -> Tuple[DataReader[DataType], Artifacts]:
) -> Tuple[DataReader, Artifacts]:
with storage.open(path) as (f, art):
sparse.save_npz(f, art)
sparse.save_npz(f, data.data)
return ScipyReader(data_type=data), {self.art_name: art}


Expand Down
10 changes: 4 additions & 6 deletions mlem/contrib/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@

import sklearn
from sklearn.base import ClassifierMixin, RegressorMixin
from sklearn.feature_extraction.text import TransformerMixin, _VectorizerMixin
from sklearn.feature_extraction.text import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing._encoders import _BaseEncoder

from mlem.constants import TRANSFORM_METHOD_NAME
from mlem.core.hooks import IsInstanceHookMixin
Expand Down Expand Up @@ -137,10 +138,7 @@ def process(


class SklearnTransformer(SklearnModel):
valid_types: ClassVar = (
TransformerMixin,
_VectorizerMixin,
)
valid_types: ClassVar = (TransformerMixin, _BaseEncoder)
type: ClassVar = "sklearn_transformer"

@classmethod
Expand All @@ -159,7 +157,7 @@ def process(
TRANSFORM_METHOD_NAME: Signature.from_method(
obj.transform,
auto_infer=sample_data is not None,
raw_documents=sample_data,
X=sample_data,
),
}

Expand Down
32 changes: 32 additions & 0 deletions tests/contrib/test_scipy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import numpy as np
import pytest
from scipy.sparse import csr_matrix

from mlem.contrib.scipy import ScipySparseMatrix
from mlem.core.data_type import DataAnalyzer
from tests.conftest import data_write_read_check


@pytest.fixture
def test_data():
    """Fixture: a 3x3 float32 CSR sparse matrix with six non-zero cells."""
    # Each triplet is (row, col, value) for one stored element.
    triplets = [(0, 0, 1), (0, 2, 2), (1, 2, 3), (2, 0, 4), (2, 1, 5), (2, 2, 6)]
    rows, cols, values = (np.array(seq) for seq in zip(*triplets))
    return csr_matrix((values, (rows, cols)), shape=(3, 3), dtype="float32")


def test_sparse_matrix(test_data):
    """Analyzing a csr_matrix must yield a ScipySparseMatrix data type.

    Renamed from ``test_sparce_matrix``: the class itself was renamed
    ``ScipySparceMatrix`` -> ``ScipySparseMatrix``, so the test name should
    use the corrected spelling as well.
    """
    assert ScipySparseMatrix.is_object_valid(test_data)
    sdt = DataAnalyzer.analyze(test_data)
    # Serialized form carries only the dtype plus the type discriminator.
    assert sdt.dict() == {"dtype": "float32", "type": "csr_matrix"}
    assert isinstance(sdt, ScipySparseMatrix)
    assert sdt.dtype == "float32"
    assert sdt.get_requirements().modules == ["scipy"]


def test_write_read(test_data):
    """Round-trip a sparse matrix through write/read and compare densely."""

    def dense_equal(left, right):
        # Sparse matrices need a custom comparison; compare dense views.
        return np.array_equal(left.todense(), right.todense())

    data_type = DataAnalyzer.analyze(test_data).bind(test_data)
    data_write_read_check(data_type, custom_eq=dense_equal)
53 changes: 35 additions & 18 deletions tests/contrib/test_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@
import lightgbm as lgb
import numpy as np
import pytest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

from mlem.constants import PREDICT_METHOD_NAME, TRANSFORM_METHOD_NAME
from mlem.contrib.numpy import NumpyNdarrayType
from mlem.contrib.scipy import ScipySparceMatrix

# from mlem.contrib.scipy import ScipySparceMatrix
from mlem.contrib.scipy import ScipySparseMatrix
from mlem.contrib.sklearn import SklearnModel, SklearnTransformer
from mlem.core.artifacts import LOCAL_STORAGE
from mlem.core.data_type import DataAnalyzer
Expand All @@ -26,9 +28,9 @@ def inp_data():
return [[1, 2, 3], [3, 2, 1]]


@pytest.fixture
def inp_data_text():
return ["Is that peanut butter on my nose? Mlem!"]
# @pytest.fixture
# def inp_data_text():
# return ["Is that peanut butter on my nose? Mlem!"]


@pytest.fixture
Expand All @@ -51,12 +53,19 @@ def regressor(inp_data, out_data):


@pytest.fixture
def transformer(inp_data_text):
tf_idf = TfidfVectorizer()
tf_idf.fit(inp_data_text)
def transformer(inp_data):
tf_idf = TfidfTransformer()
tf_idf.fit(inp_data)
return tf_idf


@pytest.fixture
def onehotencoder(inp_data):
    """Fixture: a ``OneHotEncoder`` fitted on the shared sample data."""
    # sklearn's fit() returns the estimator itself, so this is one expression.
    return OneHotEncoder().fit(inp_data)


@pytest.fixture()
def pipeline(inp_data, out_data):
pipe = Pipeline([("scaler", StandardScaler()), ("svc", SVC())])
Expand Down Expand Up @@ -90,24 +99,32 @@ def test_hook(model_fixture, inp_data, request):
assert signature.returns == returns


def test_hook_transformer(transformer, inp_data_text):
data_type = DataAnalyzer.analyze(inp_data_text)
model_type = ModelAnalyzer.analyze(transformer, sample_data=inp_data_text)
@pytest.mark.parametrize(
"transformer_fixture", ["transformer", "onehotencoder"]
)
def test_hook_transformer(transformer_fixture, inp_data, request):
transformer = request.getfixturevalue(transformer_fixture)
data_type = DataAnalyzer.analyze(inp_data)
model_type = ModelAnalyzer.analyze(transformer, sample_data=inp_data)
assert isinstance(model_type, SklearnTransformer)
assert TRANSFORM_METHOD_NAME in model_type.methods
signature = model_type.methods[TRANSFORM_METHOD_NAME]
returns = ScipySparceMatrix(dtype="float64")
returns = ScipySparseMatrix(dtype="float64")
assert signature.name == TRANSFORM_METHOD_NAME
assert signature.args[0] == Argument(name="raw_documents", type_=data_type)
assert signature.args[0] == Argument(name="X", type_=data_type)
assert signature.returns == returns


def test_model_type__transform(transformer, inp_data_text):
model_type = ModelAnalyzer.analyze(transformer, sample_data=inp_data_text)
@pytest.mark.parametrize(
"transformer_fixture", ["transformer", "onehotencoder"]
)
def test_model_type__transform(transformer_fixture, inp_data, request):
transformer = request.getfixturevalue(transformer_fixture)
model_type = ModelAnalyzer.analyze(transformer, sample_data=inp_data)

np.testing.assert_array_almost_equal(
transformer.transform(inp_data_text).todense(),
model_type.call_method("transform", inp_data_text).todense(),
transformer.transform(inp_data).todense(),
model_type.call_method("transform", inp_data).todense(),
)


Expand Down

0 comments on commit 1f376d5

Please sign in to comment.