salesforce · mrunfeldt · Sep 30, 2021 · Sep 30, 2021
@@ -37,3 +37,4 @@ docs/_build
 .settings
 */bin
 .vscode
+modelStagingDir/
@@ -128,7 +128,8 @@ Start by picking TransmogrifAI version to match your project dependencies from t
 
 | TransmogrifAI Version                                 | Spark Version | Scala Version | Java Version |
 |-------------------------------------------------------|:-------------:|:-------------:|:------------:|
-| 0.7.1 (unreleased, master), **0.7.0 (stable)**        |    **2.4**    |    **2.11**   |    **1.8**   |
+| 1.0.0 (unreleased, master), **0.7.0 (stable)**        |    **2.4**    |    **2.11**   |    **1.8**   |
+| 0.7.1 (unreleased, master), 0.7.0 (stable)            |      2.4      |      2.11     |      1.8     |
 | 0.6.1, 0.6.0, 0.5.3, 0.5.2, 0.5.1, 0.5.0              |      2.3      |      2.11     |      1.8     |
 | 0.4.0, 0.3.4                                          |      2.2      |      2.11     |      1.8     |
 

@@ -627,6 +627,94 @@ trait RichMapFeature {
     }
   }
 
+  /**
+   * Enrichment functions for OPMap Features with Int values
+   *
+   * @param f FeatureLike
+   * @tparam T feature type, a subtype of OPMap[Int]
+   */
+  implicit class RichIntegerMapFeature[T <: OPMap[Int] : TypeTag](val f: FeatureLike[T])
+    (implicit val ttiv: TypeTag[T#Value]) {
+
+    /**
+     * Apply a smart bucketizer transformer: configures a DecisionTreeNumericMapBucketizer
+     * with `(label, f)` as input and returns its output feature
+     *
+     * @param label         label feature
+     * @param trackNulls    option to keep track of values that were missing
+     * @param trackInvalid  option to keep track of invalid values,
+     *                      eg. NaN, -/+Inf or values that fall outside the buckets
+     * @param minInfoGain   minimum info gain, one of the stopping criteria of the Decision Tree
+     * @param cleanKeys     clean text before pivoting
+     * @param allowListKeys keys to allowlist
+     * @param blockListKeys keys to blocklist
+     * @return the OPVector output feature of the bucketizer
+     */
+    def autoBucketize(
+       label: FeatureLike[RealNN],
+       trackNulls: Boolean,
+       trackInvalid: Boolean = TransmogrifierDefaults.TrackInvalid,
+       minInfoGain: Double = DecisionTreeNumericBucketizer.MinInfoGain,
+       cleanKeys: Boolean = TransmogrifierDefaults.CleanKeys,
+       allowListKeys: Array[String] = Array.empty,
+       blockListKeys: Array[String] = Array.empty
+     ): FeatureLike[OPVector] = {
+      new DecisionTreeNumericMapBucketizer[Int, T]()
+        .setInput(label, f)
+        .setTrackInvalid(trackInvalid)
+        .setTrackNulls(trackNulls)
+        .setMinInfoGain(minInfoGain)
+        .setCleanKeys(cleanKeys)
+        .setAllowListKeys(allowListKeys)
+        .setBlockListKeys(blockListKeys).getOutput()
+    }
+
+
+    /**
+     * Apply IntegerMapVectorizer or auto bucketizer (when label is present) on any OPMap that has int values
+     *
+     * Without a label, `f +: others` is vectorized by an IntegerMapVectorizer. With a label the call is
+     * delegated to [[autoBucketize]] for this feature only: `defaultValue`, `fillWithMean` and `others`
+     * are not used on that path.
+     *
+     * @param defaultValue  value to give missing keys on pivot
+     * @param fillWithMean  forwarded to IntegerMapVectorizer.setFillWithMean
+     *                      (NOTE(review): presumably fills missing keys with a per-key mean instead of
+     *                      defaultValue - confirm)
+     * @param cleanKeys     clean text before pivoting
+     * @param allowListKeys keys to allowlist
+     * @param blockListKeys keys to blocklist
+     * @param others        other features of the same type
+     * @param trackNulls    option to keep track of values that were missing
+     * @param trackInvalid  option to keep track of invalid values,
+     *                      eg. NaN, -/+Inf or values that fall outside the buckets
+     *                      (only passed on when label is present)
+     * @param minInfoGain   minimum info gain, one of the stopping criteria of the Decision Tree
+     *                      (only passed on when label is present)
+     * @param label         optional label column to be passed into autoBucketizer if present
+     * @return an OPVector feature
+     */
+    def vectorize(
+     defaultValue: Int,
+     fillWithMean: Boolean = TransmogrifierDefaults.FillWithMean,
+     cleanKeys: Boolean = TransmogrifierDefaults.CleanKeys,
+     allowListKeys: Array[String] = Array.empty,
+     blockListKeys: Array[String] = Array.empty,
+     others: Array[FeatureLike[T]] = Array.empty,
+     trackNulls: Boolean = TransmogrifierDefaults.TrackNulls,
+     trackInvalid: Boolean = TransmogrifierDefaults.TrackInvalid,
+     minInfoGain: Double = TransmogrifierDefaults.MinInfoGain,
+     label: Option[FeatureLike[RealNN]] = None
+   ): FeatureLike[OPVector] = {
+      label match {
+        case None =>
+          new IntegerMapVectorizer[T]()
+            .setInput(f +: others)
+            .setFillWithMean(fillWithMean)
+            .setDefaultValue(defaultValue)
+            .setCleanKeys(cleanKeys)
+            .setAllowListKeys(allowListKeys)
+            .setBlockListKeys(blockListKeys)
+            .setTrackNulls(trackNulls)
+            .getOutput()
+        case Some(lbl) =>
+          autoBucketize(
+            label = lbl, trackNulls = trackNulls, trackInvalid = trackInvalid,
+            minInfoGain = minInfoGain, cleanKeys = cleanKeys,
+            allowListKeys = allowListKeys, blockListKeys = blockListKeys
+          )
+      }
+    }
+  }
+
   /**
    * Enrichment functions for OPMap Features with Long values
    *

@@ -457,7 +457,7 @@ trait RichNumericFeature {
      * @param minRequiredRuleSupport  Categoricals can be removed if an association rule is found between one of the
      *                                choices and a categorical label where the confidence of that rule is above
      *                                maxRuleConfidence and the support fraction of that choice is above minRuleSupport.
-     * @param featureLabelCorrOnly    If true, then only calculate correlations between features and label instead of
+     * @param featureFeatureCorrLevel If true, then only calculate correlations between features and label instead of
      *                                the entire correlation matrix which includes all feature-feature correlations
      * @param correlationExclusion    Setting for what categories of feature vector columns to exclude from the
      *                                correlation calculation (eg. hashed text features)
@@ -668,4 +668,84 @@ trait RichNumericFeature {
     }
   }
 
+  /**
+   * Enrichment functions for Integer Feature
+   *
+   * @param f FeatureLike
+   * @tparam T feature type, a subtype of Integer
+   */
+  implicit class RichIntegerFeature[T <: Integer : TypeTag](val f: FeatureLike[T])
+    (implicit val ttiv: TypeTag[T#Value]) {
+
+    /**
+     * Fill missing values with mean
+     *
+     * @param default default value if the whole feature is filled with missing values
+     * @return transformed feature of type RealNN
+     */
+    def fillMissingWithMean(default: Double = 0.0): FeatureLike[RealNN] = {
+      f.transformWith(new FillMissingWithMean[Int, T]().setDefaultValue(default))
+    }
+
+    /**
+     * Apply a smart bucketizer transformer: configures a DecisionTreeNumericBucketizer
+     * with `(label, f)` as input and returns its output feature
+     *
+     * @param label        label feature
+     * @param trackNulls   option to keep track of values that were missing
+     * @param trackInvalid option to keep track of invalid values,
+     *                     eg. NaN, -/+Inf or values that fall outside the buckets
+     * @param minInfoGain  minimum info gain, one of the stopping criteria of the Decision Tree
+     * @return the OPVector output feature of the bucketizer
+     */
+    def autoBucketize(
+      label: FeatureLike[RealNN],
+      trackNulls: Boolean,
+      trackInvalid: Boolean = TransmogrifierDefaults.TrackInvalid,
+      minInfoGain: Double = DecisionTreeNumericBucketizer.MinInfoGain
+    ): FeatureLike[OPVector] = {
+      new DecisionTreeNumericBucketizer[Int, T]()
+        .setInput(label, f)
+        .setTrackInvalid(trackInvalid)
+        .setTrackNulls(trackNulls)
+        .setMinInfoGain(minInfoGain).getOutput()
+    }
+
+    /**
+     * Apply integer vectorizer: Converts a sequence of Integer features into a vector feature.
+     *
+     * The filled vector over `f +: others` is always built with an IntegerVectorizer. When a label is
+     * given, each of those features is additionally passed through [[autoBucketize]] (called with
+     * `trackNulls = false`) and the results are merged with the filled vector by a VectorsCombiner.
+     *
+     * @param others       other features of same type
+     * @param fillValue    value to put in place of nulls (only applied when fillWithMode is false)
+     * @param trackNulls   keep track of when nulls occur by adding a second column to the vector with a null indicator
+     * @param fillWithMode replace missing values with the value IntegerVectorizer computes after
+     *                     setFillWithMode (as opposed to constant provided in fillValue)
+     * @param trackInvalid option to keep track of invalid values,
+     *                     eg. NaN, -/+Inf or values that fall outside the buckets
+     * @param minInfoGain  minimum info gain, one of the stopping criteria of the Decision Tree for the autoBucketizer
+     * @param label        optional label column to be passed into autoBucketizer if present
+     * @return             a vector feature containing the raw Features with filled missing values and the bucketized
+     *                     features if a label argument is passed
+     */
+    def vectorize
+    (
+      fillValue: Int,
+      fillWithMode: Boolean,
+      trackNulls: Boolean,
+      others: Array[FeatureLike[T]] = Array.empty,
+      trackInvalid: Boolean = TransmogrifierDefaults.TrackInvalid,
+      minInfoGain: Double = TransmogrifierDefaults.MinInfoGain,
+      label: Option[FeatureLike[RealNN]] = None
+    ): FeatureLike[OPVector] = {
+      val features = f +: others
+      val stage = new IntegerVectorizer[T]().setInput(features).setTrackNulls(trackNulls)
+      if (fillWithMode) stage.setFillWithMode else stage.setFillWithConstant(fillValue)
+      val filledValues = stage.getOutput()
+      label match {
+        case None =>
+          filledValues
+        case Some(lbl) =>
+          val bucketized = features.map(
+            _.autoBucketize(label = lbl, trackNulls = false, trackInvalid = trackInvalid, minInfoGain = minInfoGain)
+          )
+          new VectorsCombiner().setInput(filledValues +: bucketized).getOutput()
+      }
+    }
+  }
+
 }
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.stages.impl.feature
+
+import com.salesforce.op.UID
+import com.salesforce.op.features.types._
+import com.salesforce.op.stages.base.sequence.{SequenceEstimator, SequenceModel}
+import com.salesforce.op.utils.spark.SequenceAggregators
+import org.apache.spark.ml.linalg.Vectors
+import org.apache.spark.ml.param.{BooleanParam, IntParam}
+import org.apache.spark.sql.Dataset
+
+import scala.reflect.runtime.universe.TypeTag
+
+/**
+ * Converts a sequence of Integer features into a vector feature.
+ * Nulls are filled either with a constant (the default, `fillValue` = 0, repeated once per input
+ * feature) or, after calling `setFillWithMode`, with values aggregated from the dataset at fit time.
+ * When `trackNulls` is set, null-indicator metadata is attached at fit time and the flag is handed
+ * to the fitted [[IntegerVectorizerModel]].
+ *
+ * NOTE(review): the setter `setFillWithMode` and the helper `mode` are named after the mode, but the
+ * aggregator actually used is `SequenceAggregators.MeanSeqNullInteger` - confirm which statistic
+ * is intended.
+ *
+ * @param uid           uid for instance
+ * @param operationName operation name passed to SequenceEstimator
+ * @tparam T            Integer feature type to vectorize
+ */
+class IntegerVectorizer[T <: Integer]
+(
+  uid: String = UID[IntegerVectorizer[_]],
+  operationName: String = "vecInteger"
+) (implicit tti: TypeTag[T], ttiv: TypeTag[T#Value])
+  extends SequenceEstimator[T, OPVector](operationName = operationName, uid = uid)
+    with VectorizerDefaults with TrackNullsParam {
+
+  final val fillValue = new IntParam(this, "fillValue", "default value for FillWithConstant")
+  setDefault(fillValue, 0)
+
+  final val withConstant = new BooleanParam(this, "fillWithConstant",
+    "boolean to check if filling the nulls with a constant value")
+  setDefault(withConstant, true)
+
+  def setFillWithConstant(value: Int): this.type = {
+    set(fillValue, value)
+    set(withConstant, true)
+  }
+  def setFillWithMode: this.type = set(withConstant, false)
+
+  private def constants(): Seq[Int] = {
+    val size = getInputFeatures().length
+    val defValue = $(fillValue)
+    val constants = List.fill(size)(defValue)
+    constants
+  }
+
+  private def mode(dataset: Dataset[Seq[T#Value]]): Seq[Int] = {
+    val size = getInputFeatures().length
+    dataset.select(SequenceAggregators.MeanSeqNullInteger(size = size).toColumn).first()
+  }
+
+  def fitFn(dataset: Dataset[Seq[T#Value]]): SequenceModel[T, OPVector] = {
+    if ($(trackNulls)) setMetadata(vectorMetadataWithNullIndicators.toMetadata)
+
+    val fillValues = if ($(withConstant)) constants() else mode(dataset)
+
+    new IntegerVectorizerModel[T](
+      fillValues = fillValues, trackNulls = $(trackNulls), operationName = operationName, uid = uid)
+
+  }
+
+}
+/**
+ * Model produced by [[IntegerVectorizer]]. For every row, each input is paired positionally with its
+ * entry in `fillValues`; a missing value is replaced by that entry and the result is converted to Double.
+ * With `trackNulls` each value is immediately followed by a null indicator
+ * (`booleanToDouble` of whether the value was empty). The values are returned as a dense OPVector.
+ *
+ * @param fillValues    replacement values, zipped positionally with the inputs of a row
+ * @param trackNulls    whether to emit a null indicator after each value
+ * @param operationName operation name passed to SequenceModel
+ * @param uid           uid for instance
+ */
+final class IntegerVectorizerModel[T <: Integer] private[op]
+(
+  val fillValues: Seq[Int],
+  val trackNulls: Boolean,
+  operationName: String,
+  uid: String
+)(implicit tti: TypeTag[T])
+  extends SequenceModel[T, OPVector](operationName = operationName, uid = uid)
+    with VectorizerDefaults {
+
+  def transformFn: Seq[T] => OPVector = row => {
+    val replaced = if (!trackNulls) {
+      row.zip(fillValues).
+        map { case (i, m) => i.value.getOrElse(m).toDouble }
+    }
+    else {
+      row.zip(fillValues).
+        flatMap { case (i, m) => i.value.getOrElse(m).toDouble :: booleanToDouble(i.value.isEmpty) :: Nil }
+    }
+    Vectors.dense(replaced.toArray).toOPVector
+  }
+
+}
@@ -110,6 +110,35 @@ class BinaryMapVectorizer[T <: OPMap[Boolean]](uid: String = UID[BinaryMapVector
     new BinaryMapVectorizerModel(args, operationName = operationName, uid = uid)
 }
 
+/**
+ * Class for vectorizing OPMap[Int] features. `setFillWithMean(true)` switches `withConstant` off;
+ * `fillByKey` then cleans the input maps and aggregates per-key fill values at fit time with
+ * `SequenceAggregators.MeanSeqMapInteger`, passing each result map through `convertFn`.
+ * While `withConstant` is on, `fillByKey` returns no per-key fill values.
+ *
+ * @param uid uid for instance
+ * @param tti type tag for input
+ * @tparam T input feature type to vectorize into an OPVector
+ */
+class IntegerMapVectorizer[T <: OPMap[Int]](uid: String = UID[IntegerMapVectorizer[T]])(implicit tti: TypeTag[T])
+  extends OPMapVectorizer[Int, T](uid = uid, operationName = "vecIntegerMap", convertFn = integerMapToRealMap) {
+
+  def setFillWithMean(shouldFill: Boolean): this.type = set(withConstant, !shouldFill)
+
+  override def fillByKey(dataset: Dataset[Seq[T#Value]]): Seq[Map[String, Double]] = {
+    if ($(withConstant)) Seq.empty
+    else {
+      val size = getInputFeatures().length
+      val meanAggr = SequenceAggregators.MeanSeqMapInteger(size = size)
+      val shouldCleanKeys = $(cleanKeys)
+      val cleanedData = dataset.map(_.map(
+        cleanMap(_, shouldCleanKey = shouldCleanKeys, shouldCleanValue = shouldCleanValues)
+      ))
+      cleanedData.select(meanAggr.toColumn).first()
+      }.map(convertFn)
+  }
+
+  def makeModel(args: OPMapVectorizerModelArgs, operationName: String, uid: String): OPMapVectorizerModel[Int, T] =
+    new IntegerMapVectorizerModel(args, operationName = operationName, uid = uid)
+}
+
 /**
  * Class for vectorizing IntegralMap features. Fills missing keys with the mode for that key.
  *
@@ -377,6 +406,16 @@ final class BinaryMapVectorizerModel[T <: OPMap[Boolean]] private[op]
   def convertFn: Map[String, Boolean] => Map[String, Double] = booleanToRealMap
 }
 
+final class IntegerMapVectorizerModel[T <: OPMap[Int]] private[op]
+(
+  args: OPMapVectorizerModelArgs, // forwarded unchanged to OPMapVectorizerModel
+  operationName: String,
+  uid: String
+)(implicit tti: TypeTag[T])
+  extends OPMapVectorizerModel[Int, T](args = args, operationName = operationName, uid = uid) {
+  def convertFn: Map[String, Int] => Map[String, Double] = integerMapToRealMap // as in IntegerMapVectorizer
+}
+
 final class IntegralMapVectorizerModel[T <: OPMap[Long]] private[op]
 (
   args: OPMapVectorizerModelArgs,
-Original file line number
+Diff line change
@@ Expand Up / @@ -37,3 +37,4 @@ docs/_build @@
     .settings
     */bin
     .vscode
+    modelStagingDir/