Commit: Change error for Warning, add assertion in RowUtils, rollback test changes
Showing 13 changed files with 294 additions and 149 deletions.
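Two of the changed files are new test suites, reproduced below. The first, RowUtilsTest, exercises RowUtils.rowValuesToPoint, which maps a row into the unit hypercube using the revision's transformations. As a reading aid for the expected values (this helper is illustrative, not code from the commit): a LinearTransformation over a range [min, max] presumably normalizes a value as (value - min) / (max - min), which is consistent with the assertions in the tests.

// Hypothetical helper, shown only to explain the expected coordinates below.
def linearNormalize(value: Double, min: Double, max: Double): Double =
  (value - min) / (max - min)

linearNormalize(1.0, 0.0, 10.0) // == 0.1, the coordinate asserted in the first test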
src/test/scala/io/qbeast/spark/index/RowUtilsTest.scala
111 additions & 0 deletions

@@ -0,0 +1,111 @@
package io.qbeast.spark.index

import io.qbeast.core.model.DoubleDataType
import io.qbeast.core.model.Point
import io.qbeast.core.model.QTableID
import io.qbeast.core.model.Revision
import io.qbeast.core.model.StringDataType
import io.qbeast.core.transform.CDFNumericQuantilesTransformation
import io.qbeast.core.transform.CDFQuantilesTransformer
import io.qbeast.core.transform.HashTransformation
import io.qbeast.core.transform.HashTransformer
import io.qbeast.core.transform.LinearTransformation
import io.qbeast.core.transform.LinearTransformer
import io.qbeast.QbeastIntegrationTestSpec
import org.apache.spark.sql.AnalysisException

class RowUtilsTest extends QbeastIntegrationTestSpec {

  "RowUtils" should "transform a row using LinearTransformation" in withQbeastContextSparkAndTmpWarehouse {
    (spark, _) =>
      import spark.implicits._
      val df = Seq((1.0, "a")).toDF("id", "name")
      val revision = Revision(
        12L,
        12L,
        QTableID("test"),
        100,
        Vector(LinearTransformer("id", DoubleDataType)),
        List(LinearTransformation(0.0, 10.0, 5.0, DoubleDataType)))
      val row = df.head
      val transformedRow = RowUtils.rowValuesToPoint(row, revision)
      // 1.0 in the range [0.0, 10.0] normalizes to 0.1
      transformedRow shouldBe Point(Vector(0.1))
  }

  it should "transform a row using HashTransformation" in withQbeastContextSparkAndTmpWarehouse {
    (spark, _) =>
      import spark.implicits._
      val df = Seq(("1", "a")).toDF("id", "name")
      val revision = Revision(
        12L,
        12L,
        QTableID("test"),
        100,
        Vector(HashTransformer("id", StringDataType)),
        List(HashTransformation("null")))
      val row = df.head
      val transformedRow = RowUtils.rowValuesToPoint(row, revision)
      transformedRow shouldBe Point(Vector(0.24913018394686756))
  }

  it should "transform a row using quantiles" in withQbeastContextSparkAndTmpWarehouse {
    (spark, _) =>
      import spark.implicits._
      val df = Seq((1.0, "a")).toDF("id", "name")
      val revision = Revision(
        12L,
        12L,
        QTableID("test"),
        100,
        Vector(CDFQuantilesTransformer("id", DoubleDataType)),
        List(CDFNumericQuantilesTransformation(Array(0.0, 2.0), DoubleDataType)))
      val row = df.head
      val transformedRow = RowUtils.rowValuesToPoint(row, revision)
      // 1.0 sits halfway between the quantiles 0.0 and 2.0
      transformedRow shouldBe Point(Vector(0.5))
  }

  it should "throw an error when values are out of max bound" in withQbeastContextSparkAndTmpWarehouse {
    (spark, _) =>
      import spark.implicits._
      // 20.0 exceeds the transformation's upper bound of 10.0
      val df = Seq((20.0, "a")).toDF("id", "name")
      val revision = Revision(
        12L,
        12L,
        QTableID("test"),
        100,
        Vector(LinearTransformer("id", DoubleDataType)),
        List(LinearTransformation(0.0, 10.0, 5.0, DoubleDataType)))
      val row = df.head
      an[AssertionError] shouldBe thrownBy(RowUtils.rowValuesToPoint(row, revision))
  }

  it should "throw an error when values are out of min bound" in withQbeastContextSparkAndTmpWarehouse {
    (spark, _) =>
      import spark.implicits._
      // -1.0 falls below the transformation's lower bound of 0.0
      val df = Seq((-1.0, "a")).toDF("id", "name")
      val revision = Revision(
        12L,
        12L,
        QTableID("test"),
        100,
        Vector(LinearTransformer("id", DoubleDataType)),
        List(LinearTransformation(0.0, 10.0, 5.0, DoubleDataType)))
      val row = df.head
      an[AssertionError] shouldBe thrownBy(RowUtils.rowValuesToPoint(row, revision))
  }

  it should "throw an error when Transformations are empty" in withQbeastContextSparkAndTmpWarehouse {
    (spark, _) =>
      import spark.implicits._
      val df = Seq((-1.0, "a")).toDF("id", "name")
      val revision = Revision(12L, 12L, QTableID("test"), 100, Vector.empty, List.empty)
      val row = df.head
      an[AnalysisException] shouldBe thrownBy(RowUtils.rowValuesToPoint(row, revision))
  }

}
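The two out-of-bounds tests expect an AssertionError, matching the commit title's "add assertion in RowUtils". The RowUtils implementation itself is not part of the code shown here; the following is a minimal sketch of the kind of bounds assertion the tests imply, with hypothetical names and structure:

// Illustrative sketch only; the real RowUtils.rowValuesToPoint is not shown in this diff.
// Each transformed coordinate is asserted to fall inside the unit interval.
def boundedPoint(coordinates: Vector[Double]): Point = {
  coordinates.foreach { c =>
    assert(c >= 0.0 && c <= 1.0, s"Value $c is out of the transformation bounds")
  }
  Point(coordinates)
}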
src/test/scala/io/qbeast/spark/index/SparkPlanAnalyzerTest.scala
99 additions & 0 deletions

@@ -0,0 +1,99 @@
package io.qbeast.spark.index

import io.qbeast.QbeastIntegrationTestSpec

class SparkPlanAnalyzerTest extends QbeastIntegrationTestSpec {

  object SparkPlanAnalyzerTesting extends SparkPlanAnalyzer

  // A UDF explicitly marked non-deterministic: it returns a different value on each invocation
  private lazy val nonDeterministicUDF = org.apache.spark.sql.functions
    .udf(() => {
      scala.util.Random.nextInt()
    })
    .asNondeterministic()

  "SparkPlanAnalyzer" should "detect non-determinism in LIMIT" in withSparkAndTmpDir {
    (spark, _) =>
      val df = spark
        .range(10)
        .toDF("id")
        .limit(5)
      val columnsToAnalyze = Seq("id")
      val isDeterministic =
        SparkPlanAnalyzerTesting.analyzeDataFrameDeterminism(df, columnsToAnalyze)
      isDeterministic shouldBe false
  }

  it should "detect non-determinism in SAMPLE" in withSparkAndTmpDir { (spark, _) =>
    val df = spark
      .range(10)
      .toDF("id")
      .sample(0.1)
    val columnsToAnalyze = Seq("id")
    val isDeterministic =
      SparkPlanAnalyzerTesting.analyzeDataFrameDeterminism(df, columnsToAnalyze)
    isDeterministic shouldBe false
  }

  it should "detect non-determinism in non-deterministic columns" in withSparkAndTmpDir {
    (spark, _) =>
      val df = spark
        .range(10)
        .withColumn("non_deterministic_col", nonDeterministicUDF())

      val columnsToAnalyze = Seq("non_deterministic_col")
      val isDeterministic =
        SparkPlanAnalyzerTesting.analyzeDataFrameDeterminism(df, columnsToAnalyze)
      isDeterministic shouldBe false
  }

  it should "detect non-deterministic query filters" in withSparkAndTmpDir { (spark, tmpDir) =>
    // The filter contains a non-deterministic predicate that can change the result
    // when the query is executed multiple times
    val df = spark
      .range(10)
      .toDF("id")
      .filter(nonDeterministicUDF() > 5)

    val columnsToAnalyze = Seq("id")
    val isDeterministic =
      SparkPlanAnalyzerTesting.analyzeDataFrameDeterminism(df, columnsToAnalyze)
    isDeterministic shouldBe false
  }

  it should "return true if no columnsToAnalyze are provided" in withSparkAndTmpDir {
    (spark, tmpDir) =>
      val df = spark.range(10).toDF("id")
      val isDeterministic = SparkPlanAnalyzerTesting.analyzeDataFrameDeterminism(df, Seq.empty)
      isDeterministic shouldBe true
  }

  it should "return true if all columns are deterministic" in withSparkAndTmpDir {
    (spark, tmpDir) =>
      val df = spark.range(10).toDF("id")
      val columnsToAnalyze = Seq("id")
      val isDeterministic =
        SparkPlanAnalyzerTesting.analyzeDataFrameDeterminism(df, columnsToAnalyze)
      isDeterministic shouldBe true
  }

  it should "return true if filters are deterministic" in withSparkAndTmpDir { (spark, tmpDir) =>
    val df = spark.range(10).toDF("id").filter("id > 5")
    val isDeterministic = SparkPlanAnalyzerTesting.analyzeDataFrameDeterminism(df, Seq("id"))
    isDeterministic shouldBe true
  }

  it should "mark a Qbeast Sample as deterministic" in withSparkAndTmpDir { (spark, tmpDir) =>
    val qbeastDir = tmpDir + "/qbeast"
    val df = spark.range(10).toDF("id")
    df.write.format("qbeast").option("columnsToIndex", "id").save(qbeastDir)

    // A Sample over a Qbeast table is expected to be deterministic, since Qbeast
    // rewrites sampling into filters over its index
    val qbeastDF = spark.read.format("qbeast").load(qbeastDir)
    val sampleDF = qbeastDF.sample(0.5)
    val isDeterministic =
      SparkPlanAnalyzerTesting.analyzeDataFrameDeterminism(sampleDF, Seq("id"))
    isDeterministic shouldBe true
  }

}
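analyzeDataFrameDeterminism is also not shown in this diff. Judging from the behaviors asserted above, such a check could walk the analyzed logical plan with Catalyst, flagging Sample and Limit operators and non-deterministic expressions. The sketch below uses Spark's internal Catalyst API and is an assumption about the approach, not the actual SparkPlanAnalyzer; in particular, it does not reproduce the special case in the last test, where a Sample over a Qbeast table is still considered deterministic.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.plans.logical.{GlobalLimit, LocalLimit, Sample}

// Illustrative sketch: treat the plan as non-deterministic if it contains a Sample
// or Limit operator, or if any operator carries a non-deterministic expression.
def isPlanDeterministic(df: DataFrame): Boolean = {
  val plan = df.queryExecution.analyzed
  val hasUnstableOperator = plan.collect {
    case s: Sample => s
    case l: GlobalLimit => l
    case l: LocalLimit => l
  }.nonEmpty
  val hasNonDeterministicExpression =
    plan.flatMap(_.expressions).exists(!_.deterministic)
  !hasUnstableOperator && !hasNonDeterministicExpression
}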