[reland] Refactor TorchAOBaseTensor for better BC (#2793)

jerryzh168 · jerryzh168 · commit a832bf083c9c · 2025-08-22T15:58:13.000-07:00
Summary: After this PR, tensors inheriting from TorchAOBaseTensor will have better BC (backward compatibility) support: if they add an optional tensor data attribute or an optional non-tensor attribute, BC is preserved without any additional changes. More Details: The BC story we are looking at is that, after we land some tensor, e.g. Int4Tensor or Float8Tensor, future changes should only add optional Tensor data attributes and optional non-Tensor attributes to the Tensor (other, bigger changes will require a version bump; we need to add that too). The current TorchAOBaseTensor doesn’t support this very well. Also see #2840 for a real test that adds both an optional tensor and an optional non-tensor attribute to Float8Tensor; with it, the BC test in https://github.com/pytorch/ao/blob/main/test/integration/test_load_and_run_checkpoint.py, which tests Float8Tensor, does not fail. Docs for current TorchAOBaseTensor: https://github.com/pytorch/ao/blob/e6b38bb0e1477ae6aaca0a3d30de70598be43290/torchao/utils.py#L726-L731 `tensor_data_names` (List[str]): list of names of all required tensor_data, order should match the `__init__` list of tensor subclass `optional_tensor_data_names` (List[str]): it's optional to define this field to have the additional boilerplate functions implemented for you, but this will be needed if there are some optional Tensor attributes; when defined, this will be a list of names of Tensors that can be optional `tensor_attribute_names` (List[str]): list of names of non-Tensor attributes, order should match the `__init__` list of tensor subclass, following all the `tensor_data_names` arguments and `optional_tensor_data_names` Problems: the current optional_tensor_data_names is not truly optional, since it is followed by tensor_attribute_names, which contains both required and optional attributes. So if we add a tensor data attribute to a Tensor, it will break BC.
Here are a few options: ``` class Int4Tensor(TorchAOBaseTensor): tensor_data_names = ["qdata", "scale", "zero_point"] optional_tensor_data_names = ["act_scale"] tensor_attribute_names = ["block_size", "shape", "_demo_only_optional_attr"] def __init__(self, qdata, scale, zero_point, act_scale=None, block_size=None, shape=None, _demo_only_optional_attr=None): ... # for BC def __setstate__(self, state): torch._utils._set_obj_state(self, state) if "act_scale" not in self.__dict__: self.act_scale = None ``` ``` class Int4Tensor(TorchAOBaseTensor): tensor_data_names = ["qdata", "scale", "zero_point"] optional_tensor_data_names = ["act_scale"] required_tensor_attribute_names = ["block_size", "shape"] optional_tensor_attribute_names = ["_demo_only_optional_attr"] def __init__(self, qdata, scale, zero_point, block_size, shape, act_scale=None, _demo_only_optional_attr = None): ... # for BC def __setstate__(self, state): torch._utils._set_obj_state(self, state) if "act_scale" not in self.__dict__: self.act_scale = None ``` ``` class Int4Tensor(TorchAOBaseTensor): tensor_data_names = ["qdata", "scale", "zero_point"] tensor_attribute_names = ["block_size", "shape", "_demo_only_optional_attr"] optional_tensor_data_names = ["act_scale"] def __init__(self, qdata, scale, zero_point, block_size, shape, _demo_only_optional_attr = None, act_scale = None): ... # for BC def __setstate__(self, state): torch._utils._set_obj_state(self, state) if "act_scale" not in self.__dict__: self.act_scale = None ``` Test Plan: python test/integration/test_load_and_run_checkpoint.py Reviewers: Subscribers: Tasks: Tags:
diff --git a/test/prototype/mx_formats/test_nvfp4_tensor.py b/test/prototype/mx_formats/test_nvfp4_tensor.py
@@ -307,7 +307,7 @@ def test_nvfp4_swizzled_scales_serialization():
     tensor_list, ctx = original_tensor.__tensor_flatten__()
 
     # Verify swizzled flag is preserved in context
-    assert NVFP4Tensor.tensor_attribute_names[2] == "_is_swizzled_scales"
+    assert NVFP4Tensor.optional_tensor_attribute_names[0] == "_is_swizzled_scales"
     assert ctx[2] == True
 
     # Test deserialization
diff --git a/test/test_utils.py b/test/test_utils.py
@@ -186,60 +186,103 @@ class MyTensor(TorchAOBaseTensor):
             tensor_data_names = ["qdata"]
             tensor_attribute_names = ["attr", "device"]
 
-            def __new__(cls, qdata, attr, device=None):
+            def __new__(cls, qdata, attr, device):
                 shape = qdata.shape
                 if device is None:
                     device = qdata.device
                 kwargs = {"device": device}
                 return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs)  # type: ignore[attr-defined]
 
-            def __init__(self, qdata, attr, device=None):
+            def __init__(self, qdata, attr, device):
                 self.qdata = qdata
                 self.attr = attr
 
         l = torch.nn.Linear(2, 3)
-        l.weight = torch.nn.Parameter(MyTensor(l.weight, "attr"))
+        l.weight = torch.nn.Parameter(MyTensor(l.weight, "attr", None))
         lp_tensor = l.weight
 
         another_tensor = torch.nn.Linear(2, 3).weight
         # attribute has to be the same
-        lp_tensor_for_copy = MyTensor(another_tensor, "attr")
+        lp_tensor_for_copy = MyTensor(another_tensor, "attr", None)
         self._test_default_impls_helper(lp_tensor, lp_tensor_for_copy)
 
     @skip_if_no_cuda()
     def test_default_impls_with_optional_data(self):
         class MyTensorWithOptionalData(TorchAOBaseTensor):
             tensor_data_names = ["qdata"]
-            optional_tensor_data_names = ["zero_point"]
             tensor_attribute_names = ["attr", "device"]
+            optional_tensor_data_names = ["zero_point"]
 
-            def __new__(cls, qdata, zero_point=None, attr=1.0, device=None):
+            def __new__(cls, qdata, attr, device, zero_point=None):
                 shape = qdata.shape
                 if device is None:
                     device = qdata.device
                 kwargs = {"device": device}
                 return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs)  # type: ignore[attr-defined]
 
-            def __init__(self, qdata, zero_point=None, attr=1.0, device=None):
+            def __init__(self, qdata, attr, device, zero_point=None):
                 self.qdata = qdata
+                self.attr = attr
                 self.zero_point = zero_point
+
+        # test both the optional Tensor is None
+        # and not None
+        l = torch.nn.Linear(2, 3)
+        lp_tensor = MyTensorWithOptionalData(l.weight, "attr", None, None)
+        l = torch.nn.Linear(2, 3)
+        lp_tensor_for_copy = MyTensorWithOptionalData(l.weight, "attr", None, None)
+        self._test_default_impls_helper(lp_tensor, lp_tensor_for_copy)
+
+        l = torch.nn.Linear(2, 3)
+        lp_tensor = MyTensorWithOptionalData(
+            l.weight, "attr", None, torch.zeros_like(l.weight)
+        )
+        l = torch.nn.Linear(2, 3)
+        lp_tensor_for_copy = MyTensorWithOptionalData(
+            l.weight, "attr", None, torch.zeros_like(l.weight)
+        )
+        self._test_default_impls_helper(lp_tensor, lp_tensor_for_copy)
+
+    @skip_if_no_cuda()
+    def test_default_impls_with_optional_attr(self):
+        class MyTensorWithOptionalData(TorchAOBaseTensor):
+            tensor_data_names = ["qdata"]
+            tensor_attribute_names = ["attr", "device"]
+            optional_tensor_data_names = ["zero_point"]
+            optional_tensor_attribute_names = ["optional_attr"]
+
+            def __new__(cls, qdata, attr, device, zero_point=None, optional_attr=None):
+                shape = qdata.shape
+                if device is None:
+                    device = qdata.device
+                kwargs = {"device": device}
+                return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs)  # type: ignore[attr-defined]
+
+            def __init__(
+                self, qdata, attr, device, zero_point=None, optional_attr=None
+            ):
+                self.qdata = qdata
                 self.attr = attr
+                self.zero_point = zero_point
+                self.optional_attr = optional_attr
 
         # test both the optional Tensor is None
         # and not None
         l = torch.nn.Linear(2, 3)
-        lp_tensor = MyTensorWithOptionalData(l.weight, None, "attr")
+        lp_tensor = MyTensorWithOptionalData(l.weight, "attr", None, zero_point=None)
         l = torch.nn.Linear(2, 3)
-        lp_tensor_for_copy = MyTensorWithOptionalData(l.weight, None, "attr")
+        lp_tensor_for_copy = MyTensorWithOptionalData(
+            l.weight, "attr", None, zero_point=None
+        )
         self._test_default_impls_helper(lp_tensor, lp_tensor_for_copy)
 
         l = torch.nn.Linear(2, 3)
         lp_tensor = MyTensorWithOptionalData(
-            l.weight, torch.zeros_like(l.weight), "attr"
+            l.weight, "attr", None, zero_point=None, optional_attr="value"
         )
         l = torch.nn.Linear(2, 3)
         lp_tensor_for_copy = MyTensorWithOptionalData(
-            l.weight, torch.zeros_like(l.weight), "attr"
+            l.weight, "attr", None, zero_point=None, optional_attr="value"
         )
         self._test_default_impls_helper(lp_tensor, lp_tensor_for_copy)
 
diff --git a/torchao/prototype/mx_formats/nvfp4_tensor.py b/torchao/prototype/mx_formats/nvfp4_tensor.py
@@ -79,10 +79,12 @@ class NVFP4Tensor(TorchAOBaseTensor):
     """
 
     tensor_data_names = ["qdata", "_scale_e4m3"]
-    optional_tensor_data_names = ["_per_tensor_scale", "_act_per_tensor_scale"]
     tensor_attribute_names = [
         "_block_size",
         "_orig_dtype",
+    ]
+    optional_tensor_data_names = ["_per_tensor_scale", "_act_per_tensor_scale"]
+    optional_tensor_attribute_names = [
         "_is_swizzled_scales",
         "use_triton_kernel",
         "act_quant_kwargs",
@@ -92,10 +94,10 @@ def __new__(
         cls,
         qdata,
         blockwise_scales,
-        per_tensor_scale,
-        act_per_tensor_scale,
         block_size,
         orig_dtype,
+        per_tensor_scale,
+        act_per_tensor_scale,
         is_swizzled_scales=False,
         use_triton_kernel=False,
         act_quant_kwargs=None,
@@ -116,13 +118,13 @@ def __new__(
             requires_grad=False,
         )
 
-        self._scale_e4m3 = blockwise_scales
-        self._is_swizzled_scales = is_swizzled_scales
-        self._per_tensor_scale = per_tensor_scale
-        self._act_per_tensor_scale = act_per_tensor_scale
         self.qdata = qdata
+        self._scale_e4m3 = blockwise_scales
         self._block_size = block_size
         self._orig_dtype = orig_dtype
+        self._per_tensor_scale = per_tensor_scale
+        self._act_per_tensor_scale = act_per_tensor_scale
+        self._is_swizzled_scales = is_swizzled_scales
         self.use_triton_kernel = use_triton_kernel
         self.act_quant_kwargs = act_quant_kwargs
         return self
@@ -184,10 +186,10 @@ def to_nvfp4(
         return NVFP4Tensor(
             data_lp,
             blockwise_scales,
-            per_tensor_scale,
-            act_per_tensor_scale,
             block_size,
             data_hp.dtype,
+            per_tensor_scale,
+            act_per_tensor_scale,
             is_swizzled_scales,
             use_triton_kernel,
             act_quant_kwargs,
@@ -312,10 +314,10 @@ def nvfp4_to_copy(func, types, args, kwargs):
         res = NVFP4Tensor(
             tensor.qdata,
             tensor._scale_e4m3,
-            tensor._per_tensor_scale,
-            tensor._act_per_tensor_scale,
             tensor._block_size,
             dtype,
+            tensor._per_tensor_scale,
+            tensor._act_per_tensor_scale,
             tensor._is_swizzled_scales,
             tensor.use_triton_kernel,
             tensor.act_quant_kwargs,
@@ -513,10 +515,10 @@ def nvfp4_slice(func, types, args, kwargs):
     result = NVFP4Tensor(
         sliced_data,
         sliced_scale,
-        x._per_tensor_scale,
-        x._act_per_tensor_scale,
         x._block_size,
         x._orig_dtype,
+        x._per_tensor_scale,
+        x._act_per_tensor_scale,
         x._is_swizzled_scales,
         x.use_triton_kernel,
         x.act_quant_kwargs,
@@ -532,10 +534,10 @@ def nvfp4_t(func, types, args, kwargs):
     new = NVFP4Tensor(
         old.qdata.t(),
         old._scale_e4m3,
-        old._per_tensor_scale,
-        old._act_per_tensor_scale,
         old._block_size,
         old._orig_dtype,
+        old._per_tensor_scale,
+        old._act_per_tensor_scale,
         old._is_swizzled_scales,
         old.use_triton_kernel,
         old.act_quant_kwargs,
@@ -552,10 +554,10 @@ def nvfp4_view_op(func, types, args, kwargs):
     return NVFP4Tensor(
         new_data,
         args[0]._scale_e4m3,
-        args[0]._per_tensor_scale,
-        args[0]._act_per_tensor_scale,
         args[0]._block_size,
         args[0]._orig_dtype,
+        args[0]._per_tensor_scale,
+        args[0]._act_per_tensor_scale,
         args[0]._is_swizzled_scales,
         args[0].use_triton_kernel,
         args[0].act_quant_kwargs,
diff --git a/torchao/quantization/quantize_/workflows/float8/float8_tensor.py b/torchao/quantization/quantize_/workflows/float8/float8_tensor.py
@@ -94,7 +94,8 @@ class Float8Tensor(TorchAOBaseTensor):
     """
 
     tensor_data_names = ["qdata", "scale"]
-    tensor_attribute_names = [
+    tensor_attribute_names = []
+    optional_tensor_attribute_names = [
         "block_size",
         "mm_config",
         "hp_value_lb",
@@ -106,15 +107,15 @@ class Float8Tensor(TorchAOBaseTensor):
 
     def __new__(
         cls,
-        qdata,
-        scale,
-        block_size,
-        mm_config,
-        hp_value_lb,
-        hp_value_ub,
-        act_quant_kwargs,
-        kernel_preference,
-        dtype,
+        qdata: torch.Tensor,
+        scale: torch.Tensor,
+        block_size: Optional[List[int]] = None,
+        mm_config: Optional[Float8MMConfig] = None,
+        hp_value_lb: Optional[float] = None,
+        hp_value_ub: Optional[float] = None,
+        act_quant_kwargs: Optional[QuantizeTensorToFloat8Kwargs] = None,
+        kernel_preference: KernelPreference = KernelPreference.AUTO,
+        dtype: Optional[torch.dtype] = None,
     ):
         shape = qdata.shape
         kwargs = {}
diff --git a/torchao/quantization/quantize_/workflows/int4/int4_preshuffled_tensor.py b/torchao/quantization/quantize_/workflows/int4/int4_preshuffled_tensor.py
@@ -75,17 +75,17 @@ class Int4PreshuffledTensor(TorchAOBaseTensor):
     """
 
     tensor_data_names = ["qdata", "group_scale"]
-    optional_tensor_data_names = ["group_zero", "row_scale"]
     tensor_attribute_names = ["block_size", "shape"]
+    optional_tensor_data_names = ["group_zero", "row_scale"]
 
     def __new__(
         cls,
-        qdata,
-        group_scale,
-        group_zero,
-        row_scale,
-        block_size,
-        shape,
+        qdata: torch.Tensor,
+        group_scale: torch.Tensor,
+        block_size: List[int],
+        shape: List[int],
+        group_zero: Optional[torch.Tensor] = None,
+        row_scale: Optional[torch.Tensor] = None,
     ):
         kwargs = {}
         kwargs["device"] = qdata.device
@@ -97,19 +97,19 @@ def __init__(
         self,
         qdata: torch.Tensor,
         group_scale: torch.Tensor,
-        group_zero: Optional[torch.Tensor],
-        row_scale: Optional[torch.Tensor],
         block_size: List[int],
         shape: List[int],
+        group_zero: Optional[torch.Tensor] = None,
+        row_scale: Optional[torch.Tensor] = None,
     ):
         # one and only one of group_scale and group_zero should be None
         assert group_zero is None or row_scale is None
         assert not (group_zero is not None and row_scale is not None)
         self.qdata = qdata
-        self.group_scale = group_scale
-        self.group_zero = group_zero
         self.row_scale = row_scale
         self.block_size = block_size
+        self.group_scale = group_scale
+        self.group_zero = group_zero
 
     def _quantization_type(self):
         return f"shape={self.shape}, block_size={self.block_size}, device={self.device}"
@@ -178,10 +178,10 @@ def from_hp(
         return Int4PreshuffledTensor(
             qdata=wq,
             group_scale=group_scale,
-            group_zero=group_zero,
-            row_scale=row_scale,
             block_size=block_size,
             shape=original_shape,
+            group_zero=group_zero,
+            row_scale=row_scale,
         )
 
 
diff --git a/torchao/utils.py b/torchao/utils.py