Oneflow-Inc · ShawnXuan · Sep 5, 2024 · Sep 3, 2024 · Sep 3, 2024 · Sep 3, 2024
diff --git a/libai/inference/basic.py b/libai/inference/basic.py
@@ -43,6 +43,7 @@ def __init__(
         pipeline_num_layers=None,
         model_path=None,
         mode="libai",
+        device="cuda",
         **kwargs,
     ):
         # init cfg
@@ -60,6 +61,9 @@ def __init__(
             pipeline_stage_id,
             pipeline_num_layers,
         )
+        self.device = device
+        if device:
+            self.cfg.train.dist.device_type = device
         dist.setup_dist_util(self.cfg.train.dist)
         logger.info(self.cfg.train.dist)
 
@@ -167,7 +171,9 @@ def to_local(self, model_outputs_dict):
         for key, value in model_outputs_dict.items():
             if isinstance(value, flow.Tensor) and value.is_global:
                 model_outputs_dict[key] = dist.ttol(
-                    value, ranks=[0] if value.placement.ranks.ndim == 1 else [[0]]
+                    value,
+                    device=self.device,
+                    ranks=[0] if value.placement.ranks.ndim == 1 else [[0]],
                 )
         if flow.cuda.is_available():
             dist.synchronize()

diff --git a/projects/Llama/readme.md → projects/Llama/README.md b/projects/Llama/readme.md → projects/Llama/README.md
@@ -44,4 +44,17 @@ python projects/Llama/utils/eval_adapter.py
 - Adjust the parameters in the `projects/Llama/pipeline.py`, and running:
 ```bash
 bash tools/infer.sh projects/Llama/pipeline.py 8
-```
+```
+
+## NPU/XPU examples
+
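+- NPU (the config imports `oneflow_npu`; set `pretrained_model_path` in it to your local checkpoint first)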
+```bash
+python projects/Llama/pipeline.py --device=npu --mode=huggingface --config_file=projects/Llama/configs/llama_config_npu.py
+```
+
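+- XPU (the config imports `oneflow_xpu`; set `pretrained_model_path` in it to your local checkpoint first)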
+```bash
+python projects/Llama/pipeline.py --device=xpu --mode=huggingface --config_file=projects/Llama/configs/llama_config_xpu.py
+```
+
diff --git a/projects/Llama/configs/llama_config_npu.py b/projects/Llama/configs/llama_config_npu.py
@@ -0,0 +1,64 @@
+from omegaconf import DictConfig, OmegaConf
+
+from libai.config import LazyCall
+from projects.Llama.llama import LlamaForCausalLM
+from projects.Llama.tokenizer import LlamaTokenizer
+from configs.common.train import train  # noqa: F401  (presumably this config's `train` section)
+
+import oneflow_npu  # noqa: F401  (unused below; presumably enables the "npu" device -- confirm)
+
+cfg = dict(
+    # Model
+    hidden_act="silu",
+    hidden_size=4096,
+    initializer_range=0.02,
+    intermediate_size=11008,
+    max_position_embeddings=2048,
+    num_attention_heads=32,
+    hidden_layers=32,
+    pretraining_tp=1,
+    rms_norm_eps=1e-05,
+    rope_scaling=None,
+    tie_word_embeddings=False,
+    vocab_size=32000,
+    use_scaled_init_for_output_weights=False,
+    scale_mask_softmax_fusion=False,
+    amp_enabled=True,
+    # Inference
+    is_encoder_decoder=False,
+    max_length=256,
+    min_length=0,
+    do_sample=False,
+    early_stopping=False,
+    num_beams=1,
+    num_beam_groups=1,
+    diversity_penalty=0.0,
+    temperature=0.9,
+    top_k=50,
+    top_p=0.6,
+    typical_p=1.0,
+    repetition_penalty=1.0,
+    length_penalty=1.0,
+    no_repeat_ngram_size=0,
+    encoder_no_repeat_ngram_size=0,
+    num_return_sequences=1,
+    chunk_size_feed_forward=0,
+    output_scores=False,
+    use_cache=True,
+    bos_token_id=1,
+    eos_token_id=2,
+    pad_token_id=0,
+    # Pretrained checkpoint (only the weights path is set here; no training options)
+    # pretrained_model_path="meta-llama/Llama-2-7b-hf",
+    pretrained_model_path="/data0/hf_models/Llama-2-7b-chat-hf",  # NOTE(review): local path
+)
+
+cfg = DictConfig(cfg)  # rebind: plain dict -> OmegaConf DictConfig
+
+model = LazyCall(LlamaForCausalLM)(cfg=cfg)
+tokenization = OmegaConf.create()
+tokenization.make_vocab_size_divisible_by = 1
+tokenization.tokenizer = LazyCall(LlamaTokenizer)(
+    # pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model"
+    pretrained_model_path="/data0/hf_models/Llama-2-7b-chat-hf/tokenizer.model"  # local path
+)
diff --git a/projects/Llama/configs/llama_config_xpu.py b/projects/Llama/configs/llama_config_xpu.py
@@ -0,0 +1,64 @@
+from omegaconf import DictConfig, OmegaConf
+
+from libai.config import LazyCall
+from projects.Llama.llama import LlamaForCausalLM
+from projects.Llama.tokenizer import LlamaTokenizer
+from configs.common.train import train  # noqa: F401  (presumably this config's `train` section)
+
+import oneflow_xpu  # noqa: F401  (unused below; presumably enables the "xpu" device -- confirm)
+
+cfg = dict(
+    # Model
+    hidden_act="silu",
+    hidden_size=4096,
+    initializer_range=0.02,
+    intermediate_size=11008,
+    max_position_embeddings=2048,
+    num_attention_heads=32,
+    hidden_layers=32,
+    pretraining_tp=1,
+    rms_norm_eps=1e-05,
+    rope_scaling=None,
+    tie_word_embeddings=False,
+    vocab_size=32000,
+    use_scaled_init_for_output_weights=False,
+    scale_mask_softmax_fusion=False,
+    amp_enabled=True,
+    # Inference
+    is_encoder_decoder=False,
+    max_length=256,
+    min_length=0,
+    do_sample=False,
+    early_stopping=False,
+    num_beams=1,
+    num_beam_groups=1,
+    diversity_penalty=0.0,
+    temperature=0.9,
+    top_k=50,
+    top_p=0.6,
+    typical_p=1.0,
+    repetition_penalty=1.0,
+    length_penalty=1.0,
+    no_repeat_ngram_size=0,
+    encoder_no_repeat_ngram_size=0,
+    num_return_sequences=1,
+    chunk_size_feed_forward=0,
+    output_scores=False,
+    use_cache=True,
+    bos_token_id=1,
+    eos_token_id=2,
+    pad_token_id=0,
+    # Pretrained checkpoint (only the weights path is set here; no training options)
+    # pretrained_model_path="meta-llama/Llama-2-7b-hf",
+    pretrained_model_path="/root/models/Llama-2-7b-chat-hf",  # NOTE(review): local path
+)
+
+cfg = DictConfig(cfg)  # rebind: plain dict -> OmegaConf DictConfig
+
+model = LazyCall(LlamaForCausalLM)(cfg=cfg)
+tokenization = OmegaConf.create()
+tokenization.make_vocab_size_divisible_by = 1
+tokenization.tokenizer = LazyCall(LlamaTokenizer)(
+    # pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model"
+    pretrained_model_path="/root/models/Llama-2-7b-chat-hf/tokenizer.model"  # local path
+)
diff --git a/projects/Llama/pipeline.py b/projects/Llama/pipeline.py
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import click
+
 from libai.inference.basic import BasePipeline
 from libai.utils import distributed as dist
 
@@ -67,7 +69,7 @@ def _parse_parameters(self, **pipeline_parameters):
 
     def preprocess(self, inputs, **kwargs) -> dict:
         # tokenizer encoderW
-        inputs = self.tokenizer.tokenize(inputs, add_bos=True, padding=True)
+        inputs = self.tokenizer.tokenize(inputs, add_bos=True, padding=True, device=self.device)
         inputs = {
             "input_ids": inputs,
         }
@@ -87,31 +89,37 @@ def postprocess(self, model_output_dict, **kwargs) -> dict:
         return records
 
 
-if __name__ == "__main__":
-    # ----- load huggingface checkpoint -----
-    # pipeline = TextGenerationPipeline(
-    #     "projects/Llama/configs/llama_config.py",
-    #     data_parallel=1,
-    #     tensor_parallel=1,
-    #     pipeline_parallel=1,
-    #     pipeline_num_layers=32,
-    #     model_path="",
-    #     mode="huggingface",
-    # )
-
-    # output = pipeline(inputs=text)
-    # if dist.is_main_process():
-    #     print(output)
-
-    # ----- load libai checkpoint -----
+@click.command()
+@click.option(
+    "--config_file",
+    default="projects/Llama/configs/llama_config.py",
+    help="Path to the configuration file.",
+)
+@click.option("--model_path", default="", help="Path to the model checkpoint.")
+@click.option(
+    "--mode",
+    default="libai",
+    help="Mode for the dataloader pipeline, e.g., 'libai' or 'huggingface'.",
+)
+@click.option(
+    "--device", default="cuda", help="Device to run the model on, e.g., 'cuda', 'xpu', 'npu'."
+)
+def main(config_file, model_path, mode, device):
+    if model_path:
+        print(
+            "Note: The '--model_path' option is for the model checkpoint only. "
+            "Please configure 'tokenization.tokenizer.pretrained_model_path' "
+            "directly in the config file."
+        )
     pipeline = TextGenerationPipeline(
-        "projects/Llama/configs/llama_config.py",
+        config_file,
         data_parallel=1,
         tensor_parallel=1,
         pipeline_parallel=1,
         pipeline_num_layers=32,
-        model_path="",
-        mode="libai",
+        model_path=model_path,
+        mode=mode,
+        device=device,
     )
 
     text = [
@@ -120,3 +128,7 @@ def postprocess(self, model_output_dict, **kwargs) -> dict:
     output = pipeline(inputs=text)
     if dist.is_main_process():
         print(output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/projects/Llama/tokenizer.py b/projects/Llama/tokenizer.py
@@ -75,9 +75,9 @@ def tokenize(
         if add_eos:
             tokens = [token + [self.eos_token_id] for token in tokens]
 
-        if device == "cuda":
+        if device:
             sbp = kwargs.get("sbp", dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]))
-            placement = kwargs.get("placement", flow.placement("cuda", [0]))
+            placement = kwargs.get("placement", flow.placement(device, [0]))
             return_token_ids = flow.tensor(tokens, sbp=sbp, placement=placement, dtype=flow.long)
         else:
             return_token_ids = flow.tensor(tokens, dtype=flow.long)

diff --git a/projects/Llama/utils/llama_loader.py b/projects/Llama/utils/llama_loader.py
@@ -26,6 +26,8 @@ def __init__(self, model, libai_cfg, pretrained_model_path, **kwargs):
 
         self.base_model_prefix_1 = "model"
         self.base_model_prefix_2 = "model"
+        if not pretrained_model_path:
+            self.pretrained_model_path = libai_cfg.pretrained_model_path
 
     def _convert_state_dict(self, flow_state_dict, cfg):
         """Convert state_dict's keys to match model.
@@ -104,3 +106,5 @@ class LlamaLoaderLiBai(ModelLoaderLiBai):
     def __init__(self, model, libai_cfg, pretrained_model_path, **kwargs):
         super().__init__(model, libai_cfg, pretrained_model_path, **kwargs)
         self.base_model_prefix_2 = "model"
+        if not pretrained_model_path:
+            self.pretrained_model_path = libai_cfg.pretrained_model_path