
Commit eb425d8

Enable Consistent SHA256 Hashing with Reduced Planner Context
Summary: Even though SHA256 hashing is used, we are still not seeing the expected identical hash generated from the original planner context inputs. The problem is that the Enumerator and StorageReservation objects we were originally trying to hash contain attributes that differ between processes/instances. To resolve this, we reduced the hashing context to only the specific attributes we need from the enumerator and storage reservation, namely:

* The output of `enumerator.enumerate(...)`, which is used as the `search_space` in both the LP and OSS planners.
  * We store the output of `enumerate` in the attribute `last_stored_search_space`. **This assumes `enumerate` will have been called before we hash the planner context inputs.**
* The StorageReservation policy (i.e., whether `HeuristicalStorageReservation` or `FixedPercentageStorageReservation` is used).
* The StorageReservation initialization attributes:
  * `_percentage`
  * `_parameter_multiplier` for `HeuristicalStorageReservation`
  * `_dense_tensor_estimate` for `HeuristicalStorageReservation`

Created helper functions:

* `hash_planner_context_inputs`, called both in `planner.hash_planner_context_inputs` and at the manifold loading call site (see D75723272).
* `hash_sha256_to_int`, passed in as the default hash function in `hash_planner_context_inputs`.

Also created a multiprocess unit test to quickly check that consistent hashes are generated across different processes given the same input.

Differential Revision: D76303748
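As a sketch of the ordering contract this implies (names are taken from the diffs below; `module`, `sharders`, `topology`, and the planner objects stand in for a real setup):

# Hashing reads cached outputs, so enumerate() and reserve() must run first.
enumerator.enumerate(module, sharders)             # populates last_stored_search_space
storage_reservation.reserve(                       # populates last_reserved_topology
    topology=topology,
    batch_size=batch_size,
    module=module,
    sharders=sharders,
)
plan_hash = planner.hash_planner_context_inputs()  # now safe to hash; returns an int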
1 parent 3b6b537 commit eb425d8

File tree

torchrec/distributed/planner/enumerators.py
torchrec/distributed/planner/planners.py
torchrec/distributed/planner/storage_reservations.py
torchrec/distributed/planner/tests/test_types.py
torchrec/distributed/planner/types.py

5 files changed, +213 -34 lines changed


torchrec/distributed/planner/enumerators.py

Lines changed: 21 additions & 0 deletions
@@ -102,6 +102,11 @@ def __init__(
             EmbeddingStorageEstimator(topology=topology, constraints=constraints),
         ]
 
+        # Initialize caching for enumerate
+        self._last_stored_search_space: Optional[List[ShardingOption]] = None
+        self._last_stored_module: Optional[nn.Module] = None
+        self._last_stored_sharders: Optional[List[ModuleSharder[nn.Module]]] = None
+
     def enumerate(
         self,
         module: nn.Module,
@@ -118,6 +123,12 @@ def enumerate(
             List[ShardingOption]: valid sharding options with values populated.
         """
 
+        if (
+            self._last_stored_module == module
+            and self._last_stored_sharders == sharders
+        ):
+            return self._last_stored_search_space  # pyre-ignore
+
         self._sharder_map = {
             sharder_name(sharder.module_type): sharder for sharder in sharders
         }
@@ -230,8 +241,18 @@ def enumerate(
 
         self.populate_estimates(sharding_options)
 
+        self._last_stored_module = module
+        self._last_stored_sharders = sharders
+        self._last_stored_search_space = sharding_options
         return sharding_options
 
+    @property
+    def last_stored_search_space(self) -> Optional[List[ShardingOption]]:
+        # NOTE: This is the last search space stored by enumerate(...); do not use
+        # this field in place of actually calling enumerate(...), as it will vary
+        # for each module/sharders pair passed in.
+        return self._last_stored_search_space
+
     def populate_estimates(self, sharding_options: List[ShardingOption]) -> None:
         for estimator in self._estimators:
             estimator.estimate(sharding_options, self._sharder_map)
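To illustrate the memoization added above, a minimal sketch (the planner-side names come from this diff; the embedding config/module import paths are assumed from the rest of torchrec):

import torch
from torchrec.distributed.embeddingbag import EmbeddingBagCollectionSharder
from torchrec.distributed.planner.enumerators import EmbeddingEnumerator
from torchrec.distributed.planner.types import Topology
from torchrec.modules.embedding_configs import EmbeddingBagConfig
from torchrec.modules.embedding_modules import EmbeddingBagCollection

topology = Topology(world_size=1, local_world_size=8, compute_device="cuda")
enumerator = EmbeddingEnumerator(topology=topology, batch_size=128)
module = EmbeddingBagCollection(
    tables=[
        EmbeddingBagConfig(
            name="table_0",
            embedding_dim=160,
            num_embeddings=10_000,
            feature_names=["f1"],
        )
    ],
    device=torch.device("meta"),  # meta device: only the search space is needed
)
sharders = [EmbeddingBagCollectionSharder()]

space_a = enumerator.enumerate(module, sharders)  # full enumeration, result cached
space_b = enumerator.enumerate(module, sharders)  # cache hit: same list returned
assert space_b is space_a
assert enumerator.last_stored_search_space is space_a  # read-only view of the cache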

torchrec/distributed/planner/planners.py

Lines changed: 5 additions & 8 deletions
@@ -39,6 +39,7 @@
 )
 from torchrec.distributed.planner.types import (
     Enumerator,
+    hash_planner_context_inputs,
     ParameterConstraints,
     Partitioner,
     PerfModel,
@@ -280,25 +281,21 @@ def collective_plan(
             sharders,
         )
 
-    def hash_planner_context_inputs(self) -> str:
+    def hash_planner_context_inputs(self) -> int:
         """
         Generates a hash for all planner inputs except for partitioner, proposer, performance model, and stats.
         These are all the inputs needed to verify whether a previously generated sharding plan is still valid in a new context.
 
         Returns:
             A hash capturing topology, batch size, enumerator, storage reservation, and constraints.
         """
-        hashable_list = [
+        return hash_planner_context_inputs(
             self._topology,
             self._batch_size,
             self._enumerator,
             self._storage_reservation,
-            frozenset(self._constraints.items()) if self._constraints else None,
-        ]
-        serialized_list = str(hashable_list).encode("utf-8")
-        hash_object = hashlib.sha256(serialized_list)
-        hash_digest = hash_object.hexdigest()
-        return hash_digest
+            self._constraints,
+        )
 
     def plan(
         self,
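For context, a hypothetical caller-side check built on the new int-valued method (the cache store and helper below are illustrative; only `hash_planner_context_inputs` comes from this commit):

# Hypothetical sketch: reuse a previously generated plan only when the
# stored context hash matches the freshly computed one.
def load_plan_if_valid(planner, cache):  # `cache`: any dict-like store (illustrative)
    current_hash = planner.hash_planner_context_inputs()  # int, process-independent
    entry = cache.get("sharding_plan")
    if entry is not None and entry["context_hash"] == current_hash:
        return entry["plan"]  # planner inputs unchanged, cached plan still valid
    return None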

torchrec/distributed/planner/storage_reservations.py

Lines changed: 15 additions & 0 deletions
@@ -163,6 +163,7 @@ class FixedPercentageStorageReservation(StorageReservation):
     def __init__(self, percentage: float) -> None:
         assert percentage >= 0 and percentage <= 1
         self._percentage: float = percentage
+        self._last_reserved_topology: Optional[Topology] = None
 
     def reserve(
         self,
@@ -174,8 +175,14 @@ def reserve(
     ) -> Topology:
         reserved_topology = copy.deepcopy(topology)
         _reserve_storage_percentage(reserved_topology, self._percentage)
+        self._last_reserved_topology = reserved_topology
         return reserved_topology
 
+    @property
+    def last_reserved_topology(self) -> Optional[Topology]:
+        """Cached value of the most recent output from the reserve() method."""
+        return self._last_reserved_topology
+
 
 class HeuristicalStorageReservation(StorageReservation):
     """
@@ -206,6 +213,7 @@ def __init__(
 
         self._dense_storage: Optional[Storage] = None
         self._kjt_storage: Optional[Storage] = None
+        self._last_reserved_topology: Optional[Topology] = None
 
     def reserve(
         self,
@@ -215,6 +223,7 @@ def reserve(
         sharders: List[ModuleSharder[nn.Module]],
         constraints: Optional[Dict[str, ParameterConstraints]] = None,
     ) -> Topology:
+        # TODO: enable proper caching of topology values through _last_reserved_topology
         reserved_topology = copy.deepcopy(topology)
 
         batch_inputs, shardable_modules = _get_batch_inputs_and_shardable_parameters(
@@ -262,8 +271,14 @@ def reserve(
             message=negative_storage_solution,
         )
 
+        self._last_reserved_topology = reserved_topology
         return reserved_topology
 
+    @property
+    def last_reserved_topology(self) -> Optional[Topology]:
+        """Cached value of the most recent output from the reserve() method."""
+        return self._last_reserved_topology
+
 
 class InferenceStorageReservation(StorageReservation):
     """

torchrec/distributed/planner/tests/test_types.py

Lines changed: 85 additions & 1 deletion
@@ -8,18 +8,30 @@
 # pyre-strict
 
 import unittest
-from typing import cast
+from typing import cast, Dict, Optional
 from unittest.mock import MagicMock
 
 import torch
+from torch import multiprocessing
 from torchrec.distributed.embedding_types import EmbeddingComputeKernel
+from torchrec.distributed.embeddingbag import EmbeddingBagCollectionSharder
+from torchrec.distributed.planner import EmbeddingShardingPlanner
+from torchrec.distributed.planner.enumerators import EmbeddingEnumerator
+from torchrec.distributed.planner.perf_models import NoopPerfModel
+from torchrec.distributed.planner.storage_reservations import (
+    HeuristicalStorageReservation,
+)
 
 from torchrec.distributed.planner.types import (
     ParameterConstraints,
     Shard,
     ShardingOption,
     Topology,
 )
+from torchrec.distributed.test_utils.multi_process import (
+    MultiProcessContext,
+    MultiProcessTestBase,
+)
 from torchrec.distributed.types import (
     BoundsCheckMode,
     CacheAlgorithm,
@@ -348,3 +360,75 @@ def test_hash_inequality(self) -> None:
         self.assertNotEqual(
             hash(pc1), hash(pc2), "Hashes should be different for different instances"
         )
+
+
+def _test_hashing_consistency(
+    rank: int,
+    world_size: int,
+    backend: str,
+    return_hash_dict: Dict[str, int],
+    local_size: Optional[int] = None,
+) -> None:
+    with MultiProcessContext(rank, world_size, backend, local_size) as ctx:
+        topology = Topology(
+            local_world_size=8,
+            world_size=1,
+            compute_device="cuda",
+        )
+        batch_size = 128
+        enumerator = EmbeddingEnumerator(topology=topology, batch_size=batch_size)
+        eb_config = EmbeddingBagConfig(
+            name="table_0",
+            embedding_dim=160,
+            num_embeddings=10000,
+            feature_names=["f1"],
+            data_type=DataType.FP16,
+        )
+        module = EmbeddingBagCollection(
+            tables=[eb_config],
+            is_weighted=False,
+            device=torch.device(
+                "meta"
+            ),  # Using meta device for now since we only need the search space
+        )
+        sharders = [EmbeddingBagCollectionSharder()]
+        enumerator.enumerate(module, sharders)  # pyre-ignore
+        storage_reservation = HeuristicalStorageReservation(percentage=0.15)
+        constraints = {"table1": ParameterConstraints()}
+
+        storage_reservation.reserve(
+            topology=topology,
+            batch_size=batch_size,
+            module=module,
+            sharders=sharders,  # pyre-ignore
+            constraints=constraints,
+        )
+        perf_model = NoopPerfModel(topology=topology)
+
+        planner1 = EmbeddingShardingPlanner(
+            topology=topology,
+            batch_size=batch_size,
+            enumerator=enumerator,
+            storage_reservation=storage_reservation,
+            performance_model=perf_model,
+            constraints=constraints,
+        )
+
+        h = planner1.hash_planner_context_inputs()
+        return_hash_dict[str(rank)] = h
+
+
+class TestConsistentHashingBetweenProcesses(MultiProcessTestBase):
+    def test_hash_consistency(self) -> None:
+        # planner
+        world_size = 2
+        return_hash_dict = multiprocessing.Manager().dict()
+        self._run_multi_process_test(
+            callable=_test_hashing_consistency,
+            world_size=world_size,
+            backend="nccl" if torch.cuda.is_available() else "gloo",
+            return_hash_dict=return_hash_dict,
+        )
+        hashes = return_hash_dict.values()
+        assert hashes[0] == hashes[1], "hash values are different."
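The multiprocess setup matters because Python's built-in hash() for str/bytes is salted per process (PYTHONHASHSEED), so it cannot serve as a cross-process fingerprint, while a SHA256 digest of the same serialized bytes is deterministic. A standalone sketch, independent of torchrec:

import subprocess
import sys

# Each child process gets its own hash salt, so built-in hash() of the same
# string typically differs between runs; the sha256 digest never does.
code = (
    "import hashlib;"
    "print(hash('planner-context'),"
    " int(hashlib.sha256(b'planner-context').hexdigest(), 16))"
)
runs = [
    subprocess.run([sys.executable, "-c", code], capture_output=True, text=True)
    for _ in range(2)
]
builtin, sha = zip(*(r.stdout.split() for r in runs))
assert sha[0] == sha[1]                    # SHA256 digests always match
print("built-in hash() values:", builtin)  # typically differ across the two runs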

torchrec/distributed/planner/types.py

Lines changed: 87 additions & 25 deletions
@@ -12,7 +12,7 @@
 from copy import deepcopy
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import cast, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
@@ -401,12 +401,15 @@ def __repr__(self) -> str:
         topology_repr += str(self._comms_bandwidths) + "\n"
         return topology_repr
 
-    def _hash(self) -> str:
+    def _hash(self) -> int:
         """
         Compute a consistent hash value for this Topology instance.
 
         Returns:
             int: A hash value for this Topology instance.
+
+        NOTE: We do not override the __hash__ method here, since the list
+        below may not cover every attribute of the instance.
         """
 
         # Compute hbms and ddrs from the devices
@@ -430,10 +433,7 @@ def _hash(self) -> int:
             self._uneven_sharding_perf_multiplier,
         ]
 
-        serialized_list = str(hashable_list).encode("utf-8")
-        hash_object = hashlib.sha256(serialized_list)
-        hash_digest = hash_object.hexdigest()
-        return hash_digest
+        return hash_sha256_to_int(hashable_list)
 
 
 # ---- INPUT / OUTPUT ----- #
@@ -743,25 +743,25 @@ class ParameterConstraints:
     key_value_params: Optional[KeyValueParams] = None
 
     def __hash__(self) -> int:
-        return hash(
-            (
-                tuple(self.sharding_types) if self.sharding_types else None,
-                tuple(self.compute_kernels) if self.compute_kernels else None,
-                self.min_partition,
-                tuple(self.pooling_factors),
-                tuple(self.num_poolings) if self.num_poolings else None,
-                tuple(self.batch_sizes) if self.batch_sizes else None,
-                self.is_weighted,
-                self.cache_params,
-                self.enforce_hbm,
-                self.stochastic_rounding,
-                self.bounds_check_mode,
-                tuple(self.feature_names) if self.feature_names else None,
-                self.output_dtype,
-                self.device_group,
-                self.key_value_params,
-            )
-        )
+        hashable_list = [
+            tuple(self.sharding_types) if self.sharding_types else None,
+            tuple(self.compute_kernels) if self.compute_kernels else None,
+            self.min_partition,
+            tuple(self.pooling_factors),
+            tuple(self.num_poolings) if self.num_poolings else None,
+            tuple(self.batch_sizes) if self.batch_sizes else None,
+            self.is_weighted,
+            self.cache_params,
+            self.enforce_hbm,
+            self.stochastic_rounding,
+            self.bounds_check_mode,
+            tuple(self.feature_names) if self.feature_names else None,
+            self.output_dtype,
+            self.device_group,
+            self.key_value_params,
+        ]
+
+        return hash_sha256_to_int(hashable_list)
 
 
 class PlannerErrorType(Enum):
@@ -967,3 +967,65 @@ class CriticalPathEstimate:
 
     def total(self) -> float:
         return self.comms_estimate + self.comp_estimate
+
+
+# ---- Types Utils ---- #
+def hash_sha256_to_int(hashable_list: List[Any]) -> int:  # pyre-ignore
+    """
+    Hashes the given data using SHA256 and returns the hash as an integer.
+    """
+    serialized_list = str(hashable_list).encode("utf-8")
+    hash_object = hashlib.sha256(serialized_list)
+    hash_digest = hash_object.hexdigest()
+    return int(hash_digest, 16)
+
+
+def hash_planner_context_inputs(
+    topology: Topology,
+    batch_size: int,
+    enumerator: Enumerator,
+    storage_reservation: StorageReservation,
+    constraints: Optional[Dict[str, ParameterConstraints]],
+    # pyre-ignore
+    hash_function: Callable[[List[Any]], int] = hash_sha256_to_int,
+) -> int:
+    assert hasattr(
+        enumerator, "last_stored_search_space"
+    ), "This enumerator is not compatible with hashing"
+    assert (
+        enumerator.last_stored_search_space is not None  # pyre-ignore
+    ), "Unable to hash planner context without an enumerator that has a precomputed search space"
+    search_space = enumerator.last_stored_search_space
+    storage_reservation_policy = type(storage_reservation).__name__
+
+    # TODO: Not the cleanest approach, but importing the actual storage
+    # reservation classes here would create a circular dependency - revisit
+    # and refactor this more cleanly later.
+    assert storage_reservation_policy in [
+        "HeuristicalStorageReservation",
+        "FixedPercentageStorageReservation",
+    ]
+    assert hasattr(
+        storage_reservation, "last_reserved_topology"
+    ), "This storage reservation is not compatible with hashing"
+    assert (
+        storage_reservation.last_reserved_topology is not None  # pyre-ignore
+    ), "Unable to hash planner context without a storage reservation that has a precomputed topology"
+
+    hashable_list = [
+        topology,
+        batch_size,
+        [
+            [
+                shard_option.fqn,
+                shard_option.sharding_type,
+                shard_option.compute_kernel,
+                tuple(shard_option.shards),
+                shard_option.cache_params,
+            ]
+            for shard_option in search_space
+        ],
+        storage_reservation_policy,
+        storage_reservation.last_reserved_topology,
+        frozenset(constraints.items()) if constraints else None,
+    ]
+    return hash_function(hashable_list)
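Since `hash_function` is pluggable, a caller can swap in another digest while keeping the same str()-serialization contract. A sketch with blake2b (an arbitrary, hypothetical alternative, not part of this commit):

import hashlib
from typing import Any, List

def hash_blake2b_to_int(hashable_list: List[Any]) -> int:
    # Same contract as hash_sha256_to_int: str()-serialize, digest, widen to int.
    serialized = str(hashable_list).encode("utf-8")
    return int(hashlib.blake2b(serialized).hexdigest(), 16)

# h = hash_planner_context_inputs(
#     topology, batch_size, enumerator, storage_reservation, constraints,
#     hash_function=hash_blake2b_to_int,
# )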
