add test

awslabs · Oct 31, 2023 · 73bbf0d · 73bbf0d
1 parent 87df8ad
commit 73bbf0d
Show file tree

Hide file tree

Showing 2 changed files with 87 additions and 1 deletion.
diff --git a/...cessing/data_transformations/dist_transformations/dist_bucket_numerical_transformation.py b/...cessing/data_transformations/dist_transformations/dist_bucket_numerical_transformation.py
@@ -44,7 +44,7 @@ def get_transformation_name() -> str:
         return "DistBucketNumericalTransformation"
 
     def apply(self, input_df: DataFrame) -> DataFrame:
-        imputed_df = apply_imputation(self.cols,self.shared_imputation, input_df)
+        imputed_df = apply_imputation(self.cols, self.shared_imputation, input_df)
         scaled_df = apply_norm(self.cols, self.shared_norm, imputed_df)
         min_val, max_val = self.range
 

diff --git a/graphstorm-processing/tests/test_dist_bucket_transformation.py b/graphstorm-processing/tests/test_dist_bucket_transformation.py
@@ -0,0 +1,86 @@
+"""
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License").
+You may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from pyspark.sql import DataFrame, SparkSession
+import numpy as np
+from numpy.testing import assert_array_equal
+
+from graphstorm_processing.data_transformations.dist_transformations import DistBucketNumericalTransformation
+
+
+def test_bucket_numerical_without_missing(user_df: DataFrame):
+    """Bucketing the ``age`` column must yield exactly three distinct encoded values."""
+    # Range [22, 33], 3 buckets, window size 0; the two "none" args are presumably imputer/normalizer.
+    transformer = DistBucketNumericalTransformation(["age"], [22, 33], 3, 0, "none", "none")
+    bucketed_df = transformer.apply(user_df)
+    distinct_age_count = bucketed_df.select("age").distinct().count()
+    assert distinct_age_count == 3
+
+def test_bucket_numerical_example(spark: SparkSession, check_df_schema):
+    """Check the per-row bucket encoding of ``age`` for a range of [10, 30] with 4 buckets.
+
+    Ages outside [low, high] (0 and 40 here) are expected to hit only the edge buckets.
+    """
+    data = [("mark", 0., None), ("john", 15., 10000),
+            ("tara", 26., 20000), ("jen", 40., 10000)]
+    columns = ["name", "age", "salary"]
+    input_df = spark.createDataFrame(data, schema=columns)
+
+    low = 10.
+    high = 30.
+    bucket_cnt = 4  # bucket ranges are 10 ~ 15; 15 ~ 20; 20 ~ 25; 25 ~ 30
+    window_size = 10.
+    bucket_transformation = DistBucketNumericalTransformation(
+        ["age"], [low, high], bucket_cnt, window_size, "none", "none")
+    output_df = bucket_transformation.apply(input_df)
+    check_df_schema(output_df)
+    out_rows = output_df.collect()
+
+    expected_vals = np.array([
+        [1., 0., 0., 0.],
+        [1., 1., 1., 0.],
+        [0., 0., 1., 1.],
+        [0., 0., 0., 1.]])
+    # Guard against a vacuous pass: enumerate() alone would accept missing rows.
+    assert len(out_rows) == len(expected_vals), "Unexpected number of output rows"
+    for i, row in enumerate(out_rows):
+        assert_array_equal(row["age"], expected_vals[i, :], err_msg=f"Row {i} is not equal")
+
+def test_bucket_numerical_second_example(spark: SparkSession):
+    """Check the bucket encoding of ``age`` for a range of [0, 100] with 10 buckets."""
+    data = [("john", 21., None), ("tim", 31., 10000),
+            ("maggie", 55., 20000)]
+    columns = ["name", "age", "salary"]
+    input_df = spark.createDataFrame(data, schema=columns)
+
+    low = 0.0
+    high = 100.0
+    bucket_cnt = 10
+    window_size = 5.0
+
+    bucket_transformation = DistBucketNumericalTransformation(
+        ["age"], [low, high], bucket_cnt, window_size, "none", "none")
+    output_df = bucket_transformation.apply(input_df)
+    out_rows = output_df.collect()
+
+    expected_vals = np.array([
+        [0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]], dtype=float)
+
+    # Guard against a vacuous pass: enumerate() alone would accept missing rows.
+    assert len(out_rows) == len(expected_vals), "Unexpected number of output rows"
+    for i, row in enumerate(out_rows):
+        assert_array_equal(row["age"], expected_vals[i, :], err_msg=f"Row {i} is not equal")