Skip to content

Commit

Permalink
Add/upgrade string indexing tests (#294)
Browse files Browse the repository at this point in the history
  • Loading branch information
crupley authored and tovbinm committed Apr 18, 2019
1 parent f984f21 commit ffb9521
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 62 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,53 +30,26 @@

package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.types._
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder}
import com.salesforce.op.utils.spark.RichDataset._
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Assertions, FlatSpec, Matchers}


@RunWith(classOf[JUnitRunner])
class OpIndexToStringNoFilterTest extends FlatSpec with TestSparkContext {
class OpIndexToStringNoFilterTest extends OpTransformerSpec[Text, OpIndexToStringNoFilter] {
val (inputData, indF) = TestFeatureBuilder(Seq(0.0, 2.0, 1.0, 0.0, 0.0, 1.0).map(_.toRealNN))
val labels = Array("a", "c")

val (ds, indF) = TestFeatureBuilder(Seq(0.0, 2.0, 1.0, 0.0, 0.0, 1.0).map(_.toRealNN))
val labels = Array("a", "c", "b")
val expected = Array("a", "b", "c", "a", "a", "c").map(_.toText)
override val transformer: OpIndexToStringNoFilter = new OpIndexToStringNoFilter().setInput(indF).setLabels(labels)

val labelsNew = Array("a", "c")
val expectedNew = Array("a", OpIndexToStringNoFilter.unseenDefault, "c", "a", "a", "c").map(_.toText)
override val expectedResult: Seq[Text] =
Array("a", OpIndexToStringNoFilter.unseenDefault, "c", "a", "a", "c").map(_.toText)

Spec[OpIndexToStringNoFilter] should "correctly deindex a numeric column" in {
val indexToStr = new OpIndexToStringNoFilter().setInput(indF).setLabels(labels)
val strs = indexToStr.transform(ds).collect(indexToStr.getOutput())

strs shouldBe expected
}

it should "correctly deindex a numeric column (shortcut)" in {
val str = indF.deindexed(labels)
val strs = str.originStage.asInstanceOf[OpIndexToStringNoFilter].transform(ds).collect(str)
strs shouldBe expected

val str2 = indF.deindexed(labels, handleInvalid = IndexToStringHandleInvalid.Error)
val strs2 = str2.originStage.asInstanceOf[OpIndexToString].transform(ds).collect(str2)
strs2 shouldBe expected
}

it should "correctly deindex even if the lables list does not match the number of indicies" in {
val indexToStr = new OpIndexToStringNoFilter().setInput(indF).setLabels(labelsNew)
val strs = indexToStr.transform(ds).collect(indexToStr.getOutput())

strs shouldBe expectedNew
}

Spec[OpIndexToString] should "correctly deindex a numeric column" in {
val indexToStr = new OpIndexToString().setInput(indF).setLabels(labels)
val strs = indexToStr.transform(ds).collect(indexToStr.getOutput())

strs shouldBe expected
it should "correctly deindex a numeric column using shortcut" in {
val str2 = indF.deindexed(labels, handleInvalid = IndexToStringHandleInvalid.NoFilter)
val strs2 = str2.originStage.asInstanceOf[OpIndexToStringNoFilter].transform(inputData).collect(str2)
strs2 shouldBe expectedResult
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Copyright (c) 2017, Salesforce.com, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package com.salesforce.op.stages.impl.feature

import com.salesforce.op.features.types._
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.utils.spark.RichDataset._
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

/**
 * Tests for [[OpIndexToString]]: turning a column of numeric indices back into
 * their string labels, both through a directly constructed stage and through
 * the `deindexed` feature shortcut.
 */
@RunWith(classOf[JUnitRunner])
class OpIndexToStringTest extends FlatSpec with TestSparkContext {

  // Column of numeric indices together with the feature that refers to it
  val (inputData, indF) = TestFeatureBuilder(Seq(0.0, 2.0, 1.0, 0.0, 0.0, 1.0).map(_.toRealNN))

  // Labels handed to the stage; the expectation below pairs index i with labels(i)
  val labels = Array("a", "c", "b")

  // Text values expected for the rows of `inputData`, in row order
  val expectedResult: Seq[Text] = Array("a", "b", "c", "a", "a", "c").map(_.toText)

  // Stage under test, wired to the index feature and the label array above
  val transformer: OpIndexToString = new OpIndexToString().setInput(indF).setLabels(labels)

  Spec[OpIndexToString] should "correctly deindex a numeric column" in {
    val transformed = transformer.transform(inputData)
    val decoded = transformed.collect(transformer.getOutput())
    decoded shouldBe expectedResult
  }

  it should "correctly deindex a numeric column (shortcut)" in {
    // Build the stage through the feature shortcut, then run that origin stage directly
    val deindexedFeature = indF.deindexed(labels, handleInvalid = IndexToStringHandleInvalid.Error)
    val shortcutStage = deindexedFeature.originStage.asInstanceOf[OpIndexToString]
    val decoded = shortcutStage.transform(inputData).collect(deindexedFeature)
    decoded shouldBe expectedResult
  }

  it should "getLabels" in {
    transformer.getLabels shouldBe labels
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,62 +32,56 @@ package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.UnaryModel
import com.salesforce.op.stages.impl.feature.StringIndexerHandleInvalid.Skip
import com.salesforce.op.stages.sparkwrappers.generic.SwUnaryModel
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.feature.StringIndexerModel
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner


@RunWith(classOf[JUnitRunner])
class OpStringIndexerNoFilterTest extends FlatSpec with TestSparkContext {
class OpStringIndexerNoFilterTest extends
OpEstimatorSpec[RealNN, UnaryModel[Text, RealNN], OpStringIndexerNoFilter[Text]] {

val txtData = Seq("a", "b", "c", "a", "a", "c").map(_.toText)
val (ds, txtF) = TestFeatureBuilder(txtData)
val expected = Array(0.0, 2.0, 1.0, 0.0, 0.0, 1.0).map(_.toRealNN)
val (inputData, txtF) = TestFeatureBuilder(txtData)
override val expectedResult: Seq[RealNN] = Array(0.0, 2.0, 1.0, 0.0, 0.0, 1.0).map(_.toRealNN)

override val estimator: OpStringIndexerNoFilter[Text] = new OpStringIndexerNoFilter[Text]().setInput(txtF)

val txtDataNew = Seq("a", "b", "c", "a", "a", "c", "d", "e").map(_.toText)
val (dsNew, txtFNew ) = TestFeatureBuilder(txtDataNew)
val (dsNew, txtFNew) = TestFeatureBuilder(txtDataNew)
val expectedNew = Array(0.0, 2.0, 1.0, 0.0, 0.0, 1.0, 3.0, 3.0).map(_.toRealNN)


Spec[OpStringIndexerNoFilter[_]] should "correctly index a text column" in {
val stringIndexer = new OpStringIndexerNoFilter[Text]().setInput(txtF)
val indices = stringIndexer.fit(ds).transform(ds).collect(stringIndexer.getOutput())

indices shouldBe expected
}

it should "correctly index a text column (shortcut)" in {
val indexed = txtF.indexed()
val indices = indexed.originStage.asInstanceOf[OpStringIndexerNoFilter[_]].fit(ds).transform(ds).collect(indexed)
indices shouldBe expected
val indices = indexed.originStage.asInstanceOf[OpStringIndexerNoFilter[_]]
.fit(inputData).transform(inputData).collect(indexed)
indices shouldBe expectedResult

val indexed2 = txtF.indexed(handleInvalid = Skip)
val indicesfit = indexed2.originStage.asInstanceOf[OpStringIndexer[_]].fit(ds)
val indices2 = indicesfit.transform(ds).collect(indexed2)
val indicesfit = indexed2.originStage.asInstanceOf[OpStringIndexer[_]].fit(inputData)
val indices2 = indicesfit.transform(inputData).collect(indexed2)
val indices3 = indicesfit.asInstanceOf[SwUnaryModel[Text, RealNN, StringIndexerModel]]
.setInput(txtFNew).transform(dsNew).collect(indexed2)
indices2 shouldBe expected
indices3 shouldBe expected
indices2 shouldBe expectedResult
indices3 shouldBe expectedResult
}

it should "correctly deinxed a numeric column" in {
val indexed = txtF.indexed()
val indices = indexed.originStage.asInstanceOf[OpStringIndexerNoFilter[_]].fit(ds).transform(ds)
val indices = indexed.originStage.asInstanceOf[OpStringIndexerNoFilter[_]].fit(inputData).transform(inputData)
val deindexed = indexed.deindexed()
val deindexedData = deindexed.originStage.asInstanceOf[OpIndexToStringNoFilter]
.transform(indices).collect(deindexed)
deindexedData shouldBe txtData
}

it should "assign new strings to the unseen string category" in {
val stringIndexer = new OpStringIndexerNoFilter[Text]().setInput(txtF)
val indices = stringIndexer.fit(ds).setInput(txtFNew).transform(dsNew).collect(stringIndexer.getOutput())

val indices = estimator.fit(inputData).setInput(txtFNew).transform(dsNew).collect(estimator.getOutput())
indices shouldBe expectedNew
}
}

0 comments on commit ffb9521

Please sign in to comment.