
Commit cc1f6cd

Update README
1 parent 1ff170b commit cc1f6cd

File tree

2 files changed: +3 -8 lines

README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -165,7 +165,7 @@ python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode in
 
 To run with int4, just pass the int4 checkpoint to generate.py.
 ```bash
-python generate.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.pth --compile
+python generate.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.pth --compile --device $DEVICE
 ```
 
 ## Speculative Sampling
````

quantize.py

Lines changed: 2 additions & 7 deletions
````diff
@@ -402,12 +402,7 @@ def __init__(self, mod, groupsize=128, inner_k_tiles=8, padding=True):
         assert inner_k_tiles in [2, 4, 8]
 
     @torch.no_grad()
-    def create_quantized_state_dict(self, use_cuda = True):
-        if use_cuda and torch.cuda.is_available():
-            device="cuda"
-        else:
-            device="cpu"
-
+    def create_quantized_state_dict(self):
         cur_state_dict = self.mod.state_dict()
         for fqn, mod in self.mod.named_modules():
             if isinstance(mod, torch.nn.Linear):
@@ -430,7 +425,7 @@ def create_quantized_state_dict(self, use_cuda = True):
                             "and that groupsize and inner_k_tiles*16 evenly divide into it")
                         continue
                 weight_int4pack, scales_and_zeros = prepare_int4_weight_and_scales_and_zeros(
-                    weight.to(torch.bfloat16).to(device=device), self.groupsize, self.inner_k_tiles
+                    weight.to(torch.bfloat16), self.groupsize, self.inner_k_tiles
                 )
                 cur_state_dict[f"{fqn}.weight"] = weight_int4pack.to('cpu')
                 cur_state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros.to('cpu')
````
