Add tensorboard to display training and evaluation metrics and revise implementation to support DLRMv2 (#3163)

lizhouyu · facebook-github-bot · commit 20ce4f0e46fc · 2025-07-09T10:57:49.000-07:00
Summary: Pull Request resolved: #3163 ### Major changes - Add TensorBoard to the benchmark testbed, specifically in `benchmark_zch.py`. - Count the number of unique values received by each rank in each epoch by revising `benchmark_zch_utils.py`. - Revise `data/nonzch_remapper.py` so that it no longer depends on the `batch.get_dict()` method; instead, it fetches the attributes of the `batch` dataclass with the built-in `vars()` function. - Revise the DLRMv2 model EBC config initialization to make each table name identical to its feature name. - Revise the DLRMv2 configuration YAML file to set the table size for each feature. - Revise the default value of the "num_embeddings" parameter in `arguments.py` to None. Rollback Plan: Differential Revision: D77841795
diff --git a/torchrec/distributed/benchmark/benchmark_zch/arguments.py b/torchrec/distributed/benchmark/benchmark_zch/arguments.py
@@ -25,7 +25,7 @@ def parse_args(argv: List[str]) -> argparse.Namespace:
     parser.add_argument(
         "--num_embeddings",  # ratio of feature ids to embedding table size # 3 axis: x-bath_idx; y-collisions; zembedding table sizes
         type=int,
-        default=100_000,
+        default=None,
         help="max_ind_size. The number of embeddings in each embedding table. Defaults"
         " to 100_000 if num_embeddings_per_feature is not supplied.",
     )
diff --git a/torchrec/distributed/benchmark/benchmark_zch/benchmark_zch.py b/torchrec/distributed/benchmark/benchmark_zch/benchmark_zch.py
diff --git a/torchrec/distributed/benchmark/benchmark_zch/benchmark_zch_utils.py b/torchrec/distributed/benchmark/benchmark_zch/benchmark_zch_utils.py
@@ -1,23 +1,13 @@
-import argparse
-import copy
 import json
 import logging
 import os
-from typing import Any, Dict
+from typing import Any, Dict, Set
 
 import numpy as np
 
 import torch
 import torch.nn as nn
-import yaml
 from torchrec.modules.mc_embedding_modules import ManagedCollisionEmbeddingCollection
-from torchrec.modules.mc_modules import (
-    DistanceLFU_EvictionPolicy,
-    ManagedCollisionCollection,
-    MCHManagedCollisionModule,
-)
-
-from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
 
 
 def get_module_from_instance(
@@ -104,6 +94,7 @@ def __init__(
         self._mch_stats: Dict[str, Any] = (
             {}
         )  # dictionary of {table_name [str]: {metric_name [str]: metric_value [int]}}
+        self.feature_name_unique_queried_values_set_dict: Dict[str, Set[int]] = {}
 
     # record mcec state to file
     def record_mcec_state(self, stage: str) -> None:
@@ -260,6 +251,7 @@ def update(self) -> None:
                     "collision_cnt": 0,
                     "rank_total_cnt": 0,
                     "num_empty_slots": 0,
+                    "num_unique_queries": 0,
                 }
             # get the input faeture values
             input_feature_values = np.array(rank_feature_value_before_fwd[feature_name])
@@ -313,4 +305,16 @@ def update(self) -> None:
                 this_rank_total_count - this_rank_hits_count - this_rank_insert_count
             )
             batch_stats[feature_name]["collision_cnt"] += int(this_rank_collision_count)
+            # get the unique values in the input feature values
+            if feature_name not in self.feature_name_unique_queried_values_set_dict:
+                self.feature_name_unique_queried_values_set_dict[feature_name] = set(
+                    input_feature_values.tolist()
+                )
+            else:
+                self.feature_name_unique_queried_values_set_dict[feature_name].update(
+                    set(input_feature_values.tolist())
+                )
+            batch_stats[feature_name]["num_unique_queries"] = len(
+                self.feature_name_unique_queried_values_set_dict[feature_name]
+            )
         self._mch_stats = batch_stats
diff --git a/torchrec/distributed/benchmark/benchmark_zch/data/configs/criteo_kaggle.yaml b/torchrec/distributed/benchmark/benchmark_zch/data/configs/criteo_kaggle.yaml
@@ -1,3 +1,7 @@
-dataset_path: "/home/lizhouyu/oss_github/dlrm/torchrec_dlrm/criteo_1tb/criteo_kaggle_processed"
+dataset_path: "/home/lizhouyu/datasets/criteo_kaggle_processed"
 batch_size: 4096
 seed: 0
+multitask_configs:
+  - task_name: is_click
+    task_weight: 1
+    task_type: classification
diff --git a/torchrec/distributed/benchmark/benchmark_zch/data/configs/kuairand_1k.yaml b/torchrec/distributed/benchmark/benchmark_zch/data/configs/kuairand_1k.yaml
@@ -1,4 +1,4 @@
-dataset_path: "/home/lizhouyu/oss_github/generative-recommenders/generative_recommenders/dlrm_v3/data/KuaiRand-1K/data"
+dataset_path: "/home/lizhouyu/datasets/kuairand-1k/data"
 batch_size: 16
 train_split_percentage: 0.75
 num_workers: 4
diff --git a/torchrec/distributed/benchmark/benchmark_zch/data/nonzch_remapper.py b/torchrec/distributed/benchmark/benchmark_zch/data/nonzch_remapper.py
@@ -99,6 +99,24 @@ def __init__(
             )
         self._input_hash_size = input_hash_size
 
+    def get_batch_kjt_dict(self, batch: Batch) -> Dict[str, KeyedJaggedTensor]:
+        """
+        Get the KJT in each batch
+        Parameters:
+            batch: the batch whose KJT is ought to be fetched
+        Returns:
+            batch_kjt_dict: a dictionary of [batch_attribute_name: KeyedJaggedTensor]
+            where only attributes whose values are KeyedJaggedTensor are fetched.
+        """
+        batch_kjt_dict = {}  # create a dictionary for return
+        batch_attr_dict = vars(batch)  # get batch's attributes and values
+        for batch_attr_name, batch_attr_value in batch_attr_dict.items():
+            if isinstance(
+                batch_attr_value, KeyedJaggedTensor
+            ):  # only fetch attributes whose values are KeyedJaggedTensor
+                batch_kjt_dict[batch_attr_name] = batch_attr_value
+        return batch_kjt_dict
+
     def remap(self, batch: Batch) -> Batch:
         # for all the attributes under batch, like batch.uih_features, batch.candidates_features,
         # get the kjt as a dict, and remap the kjt
@@ -118,7 +136,7 @@ def remap(self, batch: Batch) -> Batch:
         #     candidates_features: KeyedJaggedTensor
 
         # for every attribute in batch, remap the kjt
-        for attr_name, feature_kjt_dict in batch.get_dict().items():
+        for attr_name, feature_kjt_dict in self.get_batch_kjt_dict(batch).items():
             # separate feature kjt with {feature_name_1: feature_kjt_1, feature_name_2: feature_kjt_2, ...}
             # to multiple dict with {feature_name_1: jt_1}, {feature_name_2: jt_2}, ...
             attr_feature_jt_dict = {}
diff --git a/torchrec/distributed/benchmark/benchmark_zch/models/configs/dlrmv2.yaml b/torchrec/distributed/benchmark/benchmark_zch/models/configs/dlrmv2.yaml
@@ -9,31 +9,31 @@ over_arch_layer_sizes:
   - 1
 embedding_dim: 64
 num_embeddings_per_feature:
-  cat_0: 100000
-  cat_1: 100000
-  cat_2: 100000
-  cat_3: 100000
-  cat_4: 100000
-  cat_5: 100000
-  cat_6: 100000
-  cat_7: 100000
-  cat_8: 100000
-  cat_9: 100000
-  cat_10: 100000
-  cat_11: 100000
-  cat_12: 100000
-  cat_13: 100000
-  cat_14: 100000
-  cat_15: 100000
-  cat_16: 100000
-  cat_17: 100000
-  cat_18: 100000
-  cat_19: 100000
-  cat_20: 100000
-  cat_21: 100000
-  cat_22: 100000
-  cat_23: 100000
-  cat_24: 100000
-  cat_25: 100000
+  cat_0: 40000000
+  cat_1: 39060
+  cat_2: 17295
+  cat_3: 7424
+  cat_4: 20265
+  cat_5: 3
+  cat_6: 7122
+  cat_7: 1543
+  cat_8: 63
+  cat_9: 40000000
+  cat_10: 3067956
+  cat_11: 405282
+  cat_12: 10
+  cat_13: 2209
+  cat_14: 11938
+  cat_15: 155
+  cat_16: 4
+  cat_17: 976
+  cat_18: 14
+  cat_19: 40000000
+  cat_20: 40000000
+  cat_21: 40000000
+  cat_22: 590152
+  cat_23: 12973
+  cat_24: 108
+  cat_25: 36
 embedding_module_attribute_path: "dlrm.sparse_arch.embedding_bag_collection" # the attribute path after model
 managed_collision_module_attribute_path: "module.dlrm.sparse_arch.embedding_bag_collection.mc_embedding_bag_collection._managed_collision_collection._managed_collision_modules" # the attribute path of managed collision module after model
diff --git a/torchrec/distributed/benchmark/benchmark_zch/models/models/dlrmv2.py b/torchrec/distributed/benchmark/benchmark_zch/models/models/dlrmv2.py
@@ -35,6 +35,9 @@ def __init__(
             dense_device=dense_device,
         )
         self.train_model = DLRMTrain(self.dlrm)
+        self.table_configs: List[EmbeddingBagConfig] = list(
+            embedding_bag_collection.embedding_bag_configs()
+        )
 
     def forward(
         self, batch: Batch
@@ -55,10 +58,10 @@ def make_model_dlrmv2(
 ) -> nn.Module:
     ebc_configs = [
         EmbeddingBagConfig(
-            name=f"t_{feature_name}",
+            name=f"{feature_name}",
             embedding_dim=configs["embedding_dim"],
             num_embeddings=(
-                none_throws(configs["num_embeddings_per_feature"])[feature_idx]
+                none_throws(configs["num_embeddings_per_feature"])[feature_name]
                 if args.num_embeddings is None
                 else args.num_embeddings
             ),
@@ -76,8 +79,9 @@ def make_model_dlrmv2(
                 input_hash_size=args.input_hash_size,
                 device=torch.device("meta"),
                 world_size=get_local_size(),
-                use_mpzch=True,
+                zch_method="mpzch",
                 mpzch_num_buckets=args.num_buckets,
+                mpzch_max_probe=args.max_probe,
             )
         )
     else:
diff --git a/torchrec/modules/mc_adapter.py b/torchrec/modules/mc_adapter.py
@@ -148,9 +148,11 @@ def __init__(
         world_size: int,
         eviction_interval: int = 1,
         allow_in_place_embed_weight_update: bool = False,
-        use_mpzch: bool = False,
-        mpzch_num_buckets: Optional[int] = None,
-        mpzch_max_probe: Optional[int] = None,
+        zch_method: str = "",  # method for managing collisions, one of ["", "mpzch", "sort_zch"]
+        mpzch_num_buckets: Optional[int] = 80,
+        mpzch_max_probe: Optional[
+            int
+        ] = 100,  # max_probe for HashZchManagedCollisionModule
     ) -> None:
         """
         Initialize an EmbeddingBagCollectionAdapter.
@@ -173,38 +175,44 @@ def __init__(
         mc_modules = {}
         for table_config in ebc.embedding_bag_configs():
             table_name = table_config.name
-            if use_mpzch:
+            if zch_method == "mpzch":
                 # if use MPZCH, create a HashZchManagedCollisionModule
+                num_buckets = mpzch_num_buckets if mpzch_num_buckets else world_size
+                max_probe = (
+                    min(
+                        mpzch_max_probe,
+                        table_config.num_embeddings // world_size // num_buckets,
+                    )
+                    if mpzch_max_probe
+                    else table_config.num_embeddings // world_size // num_buckets
+                )
                 mc_modules[table_name] = HashZchManagedCollisionModule(  # MPZCH
                     is_inference=False,
                     zch_size=(table_config.num_embeddings),
                     input_hash_size=input_hash_size,
                     device=device,
-                    total_num_buckets=(
-                        mpzch_num_buckets if mpzch_num_buckets else world_size
-                    ),  # total_num_buckets if not passed, use world_size, WORLD_SIZE should be a factor of total_num_buckets
+                    total_num_buckets=num_buckets,  # total_num_buckets if not passed, use world_size, WORLD_SIZE should be a factor of total_num_buckets
+                    max_probe=max_probe,
                     eviction_policy_name=HashZchEvictionPolicyName.SINGLE_TTL_EVICTION,  # defaultly using single ttl eviction policy
                     eviction_config=HashZchEvictionConfig(
                         features=table_config.feature_names,
                         single_ttl=eviction_interval,
                     ),
-                    max_probe=(
-                        mpzch_max_probe
-                        if mpzch_max_probe is not None
-                        and mpzch_max_probe
-                        < (table_config.num_embeddings // world_size)
-                        else table_config.num_embeddings // world_size
-                    ),  # max_probe for HashZchManagedCollisionModule
                 )
-            else:  # if not use MPZCH, create a MCHManagedCollisionModule using the sort ZCH algorithm
+            elif (
+                zch_method == "sort_zch"
+            ):  # if not use MPZCH, create a MCHManagedCollisionModule using the sort ZCH algorithm
                 mc_modules[table_name] = MCHManagedCollisionModule(  # sort ZCH
                     zch_size=table_config.num_embeddings,
                     device=device,
                     input_hash_size=input_hash_size,
                     eviction_interval=eviction_interval,
                     eviction_policy=DistanceLFU_EvictionPolicy(),
                 )  # NOTE: the benchmark for sort ZCH is not implemented yet
-            # TODO: add the pure hash module here
+            else:  # any other zch_method value is unsupported, so fail explicitly
+                raise NotImplementedError(
+                    f"zc method {zch_method} is not supported yet"
+                )
 
         # create the mcebc module with the mc modules and the original ebc
         self.mc_embedding_bag_collection = (
@@ -219,19 +227,14 @@ def __init__(
             )
         )
 
-        self.remapped_ids: Optional[Dict[str, torch.Tensor]] = (
-            None  # to store remapped ids
-        )
-
     def forward(self, input_kjt: KeyedJaggedTensor) -> Dict[str, JaggedTensor]:
         """
         Args:
             input (KeyedJaggedTensor): KJT of form [F X B X L].
         Returns:
             Dict[str, JaggedTensor]: dictionary of {'feature_name': JaggedTensor}
         """
-        mc_ebc_out, remapped_ids = self.mc_embedding_bag_collection(input_kjt)
-        self.remapped_ids = remapped_ids
+        mc_ebc_out, per_table_remapped_id = self.mc_embedding_bag_collection(input_kjt)
         return mc_ebc_out
 
     def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
@@ -240,12 +243,15 @@ def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
             recurse=recurse
         )
 
-    def embedding_bag_configs(self) -> List[EmbeddingBagConfig]:
+    def embedding_bag_configs(self) -> List[EmbeddingConfig]:
         """
         Returns:
             Dict[str, EmbeddingConfig]: dictionary of {'feature_name': EmbeddingConfig}
         """
+        # pyre-ignore[16]: `ManagedCollisionEmbeddingBagCollection` has no attribute `_embedding_module`
         return (
-            # pyre-ignore [29] # NOTE: the function "embedding_configs" returns the _embedding_module attribute of the EmbeddingCollection
             self.mc_embedding_bag_collection._embedding_module.embedding_bag_configs()
         )
+
+    def get_per_table_remapped_id(self) -> Dict[str, JaggedTensor]:
+        return self.per_table_remapped_id

Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,7 @@ def parse_args(argv: List[str]) -> argparse.Namespace:`
`25`	`25`	`parser.add_argument(`
`26`	`26`	`"--num_embeddings", # ratio of feature ids to embedding table size # 3 axis: x-bath_idx; y-collisions; zembedding table sizes`
`27`	`27`	`type=int,`
`28`		`- default=100_000,`
	`28`	`+ default=None,`
`29`	`29`	`help="max_ind_size. The number of embeddings in each embedding table. Defaults"`
`30`	`30`	`" to 100_000 if num_embeddings_per_feature is not supplied.",`
`31`	`31`	`)`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-dataset_path: "/home/lizhouyu/oss_github/generative-recommenders/generative_recommenders/dlrm_v3/data/KuaiRand-1K/data"`
	`1`	`+dataset_path: "/home/lizhouyu/datasets/kuairand-1k/data"`
`2`	`2`	`batch_size: 16`
`3`	`3`	`train_split_percentage: 0.75`
`4`	`4`	`num_workers: 4`