Merge branch 'main' into gsprocessing-hard-negative
jalencato authored Nov 14, 2024
2 parents 35d4cbd + f5cf632 commit 11007eb
Showing 13 changed files with 675 additions and 108 deletions.
6 changes: 4 additions & 2 deletions docs/source/cli/graph-construction/distributed/example.rst
@@ -259,7 +259,9 @@ the graph structure, features, and labels. In more detail:
GSProcessing will use the transformation values listed here
instead of creating new ones, ensuring that models trained with the original
data can still be used in the newly transformed data. Currently only
categorical transformations can be re-applied.
categorical and numerical transformations can be re-applied. Note that
the Rank-Gauss transformation does not support re-application, so it may
only be suitable for transductive tasks.
* ``updated_row_counts_metadata.json``:
This file is meant to be used as the input configuration for the
distributed partitioning pipeline. ``gs-repartition`` produces
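To see why the Rank-Gauss transformation resists re-application, note that each output value depends on the value's rank within the full training dataset, so an unseen value has no precomputed mapping. A minimal sketch of the idea (illustrative only, not GSProcessing's Spark implementation) using the standard-library inverse normal CDF:

```python
from statistics import NormalDist


def rank_gauss(values: list[float]) -> list[float]:
    """Map values onto a standard normal distribution via their ranks."""
    n = len(values)
    order = sorted(range(n), key=lambda i: values[i])
    norm = NormalDist()
    out = [0.0] * n
    for rank, idx in enumerate(order):
        # Scale the rank into (0, 1) before applying the inverse normal CDF.
        q = (rank + 0.5) / n
        out[idx] = norm.inv_cdf(q)
    return out


print(rank_gauss([10.0, 3.0, 7.0]))
```

Because the mapping is defined only over the ranks of the data it was fit on, it cannot be stored and replayed on new values the way categorical or min-max parameters can, which is why it is limited to transductive settings.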
@@ -313,7 +315,7 @@ you can use the following command to run the partition job locally:
--num-parts 2 \
--dgl-tool-path ./dgl/tools \
--partition-algorithm random \
--ip-config ip_list.txt
--ip-config ip_list.txt
The command above will first do graph partitioning to determine the ownership for each partition and save the results.
Then it will do data dispatching to physically assign the partitions to graph data and dispatch them to each machine.
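The two phases described above can be sketched in plain Python. This is an illustrative toy, not the actual pipeline (which relies on DGL's distributed partitioning tools): phase one assigns each node an owner partition with the ``random`` algorithm, and phase two groups ("dispatches") nodes by owner.

```python
import random


def random_partition(num_nodes: int, num_parts: int, seed: int = 0) -> list[int]:
    """Phase 1: assign every node an owner partition at random."""
    rng = random.Random(seed)
    return [rng.randrange(num_parts) for _ in range(num_nodes)]


def dispatch(ownership: list[int], num_parts: int) -> dict[int, list[int]]:
    """Phase 2: group node IDs by their owning partition."""
    parts: dict[int, list[int]] = {p: [] for p in range(num_parts)}
    for node, owner in enumerate(ownership):
        parts[owner].append(node)
    return parts


owners = random_partition(10, 2)
print(dispatch(owners, 2))
```

Each node ends up in exactly one partition; the real job additionally moves features and labels to the machine that owns each partition.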
21 changes: 21 additions & 0 deletions docs/source/cli/model-training-inference/distributed/sagemaker.rst
@@ -212,6 +212,27 @@ Users can use the following commands to check the corresponding outputs:
aws s3 ls s3://<PATH_TO_SAVE_GENERATED_NODE_EMBEDDING>/
aws s3 ls s3://<PATH_TO_SAVE_PREDICTION_RESULTS>/
Launch embedding generation task
``````````````````````````````````
Users can use the following example command to launch a GraphStorm embedding generation job on the ``ogbn-mag`` data without generating predictions.

.. code:: bash

    python3 launch/launch_infer.py \
        --image-url <AMAZON_ECR_IMAGE_URI> \
        --region <REGION> \
        --entry-point run/infer_entry.py \
        --role <ROLE_ARN> \
        --instance-count 3 \
        --graph-data-s3 s3://<PATH_TO_DATA>/ogbn_mag_lp_3p \
        --yaml-s3 s3://<PATH_TO_TRAINING_CONFIG>/mag_lp.yaml \
        --model-artifact-s3 s3://<PATH_TO_SAVE_TRAINED_MODEL>/ \
        --raw-node-mappings-s3 s3://<PATH_TO_DATA>/ogbn_mag_lp_3p/raw_id_mappings \
        --task-type compute_emb \
        --output-emb-s3 s3://<PATH_TO_SAVE_GENERATED_NODE_EMBEDDING>/ \
        --graph-name ogbn-mag \
        --restore-model-layers embed,gnn
Launch graph partitioning task
```````````````````````````````
If your data are in the `DGL chunked
@@ -15,7 +15,7 @@
"""

from dataclasses import dataclass
from typing import Sequence, Optional
from typing import Optional

from graphstorm_processing.constants import SUPPORTED_FILE_TYPES

@@ -27,7 +27,7 @@ class DataStorageConfig:
"""

format: str
files: Sequence[str]
files: list[str]
separator: Optional[str] = None

def __post_init__(self):
@@ -39,3 +39,7 @@ def __post_init__(self):
raise ValueError(
f"File paths need to be relative (not starting with '/'), got : {file}"
)

for idx, file in enumerate(self.files):
if file.startswith("./"):
self.files[idx] = file[2:]
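The new ``__post_init__`` lines above validate that paths are relative and then strip a leading ``./``. A self-contained sketch of the same behavior (the class name here is illustrative, not the actual GSProcessing class):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class StorageConfigSketch:
    """Illustrative stand-in for DataStorageConfig's path handling."""

    format: str
    files: list[str]
    separator: Optional[str] = None

    def __post_init__(self):
        # Reject absolute paths outright.
        for file in self.files:
            if file.startswith("/"):
                raise ValueError(
                    f"File paths need to be relative (not starting with '/'), got : {file}"
                )
        # Normalize a leading './' so downstream path joins stay clean.
        for idx, file in enumerate(self.files):
            if file.startswith("./"):
                self.files[idx] = file[2:]


cfg = StorageConfigSketch(format="csv", files=["./data/a.csv", "data/b.csv"])
print(cfg.files)  # → ['data/a.csv', 'data/b.csv']
```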
@@ -32,7 +32,7 @@
)


class DistFeatureTransformer(object):
class DistFeatureTransformer:
"""
Given a feature configuration selects the correct transformation type,
which can then be applied through a call to apply_transformation.
@@ -57,7 +57,9 @@ def __init__(
if feat_type == "no-op":
self.transformation = NoopTransformation(**default_kwargs, **args_dict)
elif feat_type == "numerical":
self.transformation = DistNumericalTransformation(**default_kwargs, **args_dict)
self.transformation = DistNumericalTransformation(
**default_kwargs, **args_dict, json_representation=json_representation
)
elif feat_type == "multi-numerical":
self.transformation = DistMultiNumericalTransformation(**default_kwargs, **args_dict)
elif feat_type == "bucket-numerical":
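The ``json_representation`` argument threaded into ``DistNumericalTransformation`` above enables the re-application described in the docs change: fit once, persist the parameters, then transform new data using the stored values. A hedged sketch of the idea with a min-max normalization (field names and helpers here are illustrative, not GSProcessing's actual schema):

```python
import json


def fit_min_max(values: list[float]) -> dict:
    """Compute and record the parameters of a min-max normalization."""
    return {"transformation_name": "min-max", "min": min(values), "max": max(values)}


def apply_min_max(values: list[float], rep: dict) -> list[float]:
    """Re-apply the transformation from its stored representation."""
    lo, hi = rep["min"], rep["max"]
    return [(v - lo) / (hi - lo) for v in values]


rep = fit_min_max([0.0, 5.0, 10.0])
saved = json.dumps(rep)      # what a precomputed-transformations file might hold
reloaded = json.loads(saved)
print(apply_min_max([2.5, 12.0], reloaded))
```

Because the parameters are data-independent once fit, values unseen during training (like ``12.0`` here) are still mapped consistently, unlike the Rank-Gauss case.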
@@ -67,7 +67,7 @@ def get_transformation_name() -> str:
return "DistBucketNumericalTransformation"

def apply(self, input_df: DataFrame) -> DataFrame:
imputed_df = apply_imputation(self.cols, self.shared_imputation, input_df)
imputed_df = apply_imputation(self.cols, self.shared_imputation, input_df).imputed_df
# TODO: Make range optional by getting min/max from data.
min_val, max_val = self.range

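After imputation, the bucket-numerical transformation maps each value into one of a fixed number of equal-width buckets over ``[min_val, max_val]``. A minimal sketch of that bucketing step (assumed behavior in plain Python, not the actual Spark implementation):

```python
def bucket_index(value: float, min_val: float, max_val: float, num_buckets: int) -> int:
    """Map a value to an equal-width bucket index in [0, num_buckets - 1]."""
    width = (max_val - min_val) / num_buckets
    idx = int((value - min_val) / width)
    # Clamp so that max_val lands in the last bucket instead of overflowing.
    return max(0, min(idx, num_buckets - 1))


print([bucket_index(v, 0.0, 10.0, 5) for v in [0.0, 4.9, 10.0]])  # → [0, 2, 4]
```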