update parallel state following Megatron-LM parallel_state.py 2025.1 #32

Open · wants to merge 2 commits into base: master
14 changes: 7 additions & 7 deletions README.md
@@ -140,7 +140,7 @@ For the `Megatron parallel framework`, you can quickly start using the scripts/m
```bash
sh scripts/megatron_gpt.sh \
--nnodes 1 --node_rank 0 --nproc_per_node 8 --master_addr localhost --master_port 29500 \
-m 7 --world_size 8 --tensor_model_parallel_size 2 --pipeline_model_parallel 1 \
-m 7 --world_size 8 --tensor_model_parallel_size 2 --pipeline_model_parallel_size 1 \
--frame Megatron --global_batch 16 \
--micro_batch 1 --seq_length 2048 --swiglu --use_flash_attn --aiob_enable
```
@@ -150,7 +150,7 @@ For `Moe`, you can quickly start it using the [scripts/megatron_gpt.sh](scripts
```bash
sh scripts/megatron_gpt.sh \
--nnodes 1 --node_rank 0 --nproc_per_node 8 --master_addr localhost --master_port 29500 \
-m moe --world_size 8 --tensor_model_parallel_size 4 --pipeline_model_parallel 1 \
-m moe --world_size 8 --tensor_model_parallel_size 4 --pipeline_model_parallel_size 1 \
--moe_enable --expert_model_parallel_size 1 \
--frame Megatron --global_batch 16 \
--num_experts 4 --moe_router_topk 2 \
@@ -177,7 +177,7 @@ Note that the computation times are obtained through the execution of computatio
The following command not only generates the computation description file, but also runs the workload on a real GPU cluster.
```bash
sh scripts/megatron_gpt.sh \
-m 7 --world_size 8 --tensor_model_parallel_size 2 --pipeline_model_parallel 1 \
-m 7 --world_size 8 --tensor_model_parallel_size 2 --pipeline_model_parallel_size 1 \
--frame Megatron --global_batch 16 \
--micro_batch 1 --seq_length 2048 \
--swiglu --use_flash_attn --aiob_enable
@@ -187,7 +187,7 @@ Users can define their own computation times or directly use the files we provi
By specifying the computation description file with the `--comp_filepath` option, you can embed computation times before running the workload on a physical machine.
```bash
sh scripts/megatron_gpt.sh \
-m 7 --world_size 8 --tensor_model_parallel_size 2 --pipeline_model_parallel 1 \
-m 7 --world_size 8 --tensor_model_parallel_size 2 --pipeline_model_parallel_size 1 \
--frame Megatron --global_batch 16 --micro_batch 1 \
--seq_length 2048 --swiglu --use_flash_attn \
--aiob_enable \
@@ -206,7 +206,7 @@ Here, you can use the script [scripts/megatron_workload.sh](scripts/megatron_wor
```bash
sh ./scripts/megatron_workload_with_aiob.sh \
-m 7 --world_size 4096 \
--tensor_model_parallel_size 2 --pipeline_model_parallel 1 \
--tensor_model_parallel_size 2 --pipeline_model_parallel_size 1 \
--frame Megatron --global_batch 8192 \
--micro_batch 1 --seq_length 4096 \
--swiglu --use_flash_attn --aiob_enable
@@ -215,7 +215,7 @@ sh ./scripts/megatron_workload_with_aiob.sh \

```bash
sh ./scripts/megatron_workload_with_aiob.sh -m 7 \
--world_size 4096 --tensor_model_parallel_size 2 --pipeline_model_parallel 1 \
--world_size 4096 --tensor_model_parallel_size 2 --pipeline_model_parallel_size 1 \
--frame Megatron --global_batch 8192 \
--micro_batch 1 --seq_length 4096 --swiglu \
--use_flash_attn --aiob_enable \
@@ -225,7 +225,7 @@ sh ./scripts/megatron_workload_with_aiob.sh -m 7 \
For the Moe, you can also use [scripts/megatron_workload_with_aiob.sh](scripts/workload_megatron.sh) to generate the corresponding model's workload file.
```bash
sh scripts/megatron_workload_with_aiob.sh \
-m moe --world_size 512 --tensor_model_parallel_size 2 --pipeline_model_parallel 1 --sp --ep 16 \
-m moe --world_size 512 --tensor_model_parallel_size 2 --pipeline_model_parallel_size 1 --sp --ep 16 \
--num_experts 64 --moe_router_topk 2 --moe_grouped_gemm --moe_enable \
--frame Megatron --global_batch 1024 \
--micro_batch 1 --seq_length 4096 --swiglu \
8 changes: 4 additions & 4 deletions aicb.py
@@ -57,15 +57,15 @@
else:
filepath = get_aiob_path(args)
torch.distributed.barrier()
compute_cache = extract_averages(filepath,args)
compute_cache = extract_averages(filepath, args)
else:
print("comp_filepath:", args.comp_filepath)
compute_cache = extract_averages(args.comp_filepath,args)
compute_cache = extract_averages(args.comp_filepath, args)
workload = Comp_with_aiob(workload, compute_cache)
if torch.distributed.get_rank() == 0:
filename = f"{workload_generator.name}_{args.model_name}_sp_{args.enable_sequence_parallel}_iteration_{args.epoch_num}_computationEnable_{args.computation_enable}_{args.world_size}n.csv"
workload.dump(filename)
if not args.workload_only :
if not args.workload_only:
applyer = WorkloadApplyer(workload=workload, args=args)
cpu_time = applyer.apply_workload()
if torch.distributed.get_rank() == 0:
@@ -76,7 +76,7 @@
if args.enable_visual:
try:
from visualize.generate import visualize_output
visualize_output(csv_filename,False)
visualize_output(csv_filename, False)
except ImportError:
print("visualize_output is not available because required library is not found")

40 changes: 21 additions & 19 deletions log_analyzer/log.py
@@ -227,25 +227,26 @@ def _get_elapsed_time(self):
return self.epoch_times

def analyze_time(self, print_fn=print):
self.epoch_times.pop(0)
max_val = max(self.epoch_times)
min_val = min(self.epoch_times)
mean_val = sum(self.epoch_times) / len(self.epoch_times)

variance = sum((x - mean_val) ** 2 for x in self.epoch_times) / len(
self.epoch_times
)
variance = math.sqrt(variance)
if self.epoch_times:
self.epoch_times.pop(0)
max_val = max(self.epoch_times)
min_val = min(self.epoch_times)
mean_val = sum(self.epoch_times) / len(self.epoch_times)

variance = sum((x - mean_val) ** 2 for x in self.epoch_times) / len(
self.epoch_times
)
variance = math.sqrt(variance)

sorted_list = sorted(self.epoch_times)
p90_val = sorted_list[int(len(sorted_list) * 0.9)]
p99_val = sorted_list[int(len(sorted_list) * 0.99)]
header = f"{'Init time':<18} {'Max iteration time':<20} {'Min iteration time':<20} {'Avg iteration time':<20} {'P90 iteration time ':<20} {'Iteration time Std ':<20}\n"
separator = "-" * len(header) + "\n"
log_str = separator + header + separator
iteration_result = f"{self.epoch_times[0]:<18.2f} {max_val:<20.2f} {min_val:<20.2f} {mean_val:<20.2f} {p90_val:<20.2f} {variance:<20.2f}\n"
log_str += iteration_result
print_fn(f"\n\tDetailed info for AICB iteration time\n{log_str}")
sorted_list = sorted(self.epoch_times)
p90_val = sorted_list[int(len(sorted_list) * 0.9)]
p99_val = sorted_list[int(len(sorted_list) * 0.99)]
header = f"{'Init time':<18} {'Max iteration time':<20} {'Min iteration time':<20} {'Avg iteration time':<20} {'P90 iteration time ':<20} {'Iteration time Std ':<20}\n"
separator = "-" * len(header) + "\n"
log_str = separator + header + separator
iteration_result = f"{self.epoch_times[0]:<18.2f} {max_val:<20.2f} {min_val:<20.2f} {mean_val:<20.2f} {p90_val:<20.2f} {variance:<20.2f}\n"
log_str += iteration_result
print_fn(f"\n\tDetailed info for AICB iteration time\n{log_str}")


class Workload:
@@ -254,7 +255,8 @@ def __init__(self) -> None:

def append(self, log_item: Union[LogItem, Dict]):
if isinstance(log_item, LogItem):
self.workload.append(log_item)
if log_item.comm_group_size != 1:
self.workload.append(log_item)
return
if "stage" not in log_item:
log_item["stage"] = log_item["operation"] if "operation" in log_item else ""
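Taken together, the two hunks above change behavior in two ways: `analyze_time` now computes its statistics only when `epoch_times` is non-empty (so `max()`/`min()` can no longer raise on an empty list), and `Workload.append` drops `LogItem`s whose `comm_group_size` is 1, presumably because a single-rank group performs no real communication. A minimal sketch of the same two guards in isolation, with illustrative helper names that are not part of the repository:

```python
import math
from typing import List, Optional, Tuple

def iteration_stats(epoch_times: List[float]) -> Optional[Tuple[float, float, float, float, float]]:
    """Max, min, mean, p90, and std of iteration times; None when the list is empty."""
    if not epoch_times:  # the guard the diff adds: max()/min() would raise on []
        return None
    mean = sum(epoch_times) / len(epoch_times)
    std = math.sqrt(sum((x - mean) ** 2 for x in epoch_times) / len(epoch_times))
    p90 = sorted(epoch_times)[int(len(epoch_times) * 0.9)]
    return max(epoch_times), min(epoch_times), mean, p90, std

def keep_comm_item(comm_group_size: int) -> bool:
    # Mirrors the append() change: items whose communication group has size 1
    # are skipped instead of being recorded in the workload.
    return comm_group_size != 1
```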
8 changes: 4 additions & 4 deletions run_suites.py
@@ -76,11 +76,11 @@ def read_config(config):
)
if int(megatron_conf["gpt_175B"]):
running_command["megatron_gpt175B"] = (
f"bash scripts/megatron_gpt.sh -m 175 --tensor_model_parallel_size 8 --epoch_num 10 --pipeline_model_parallel 2 --sp"
f"bash scripts/megatron_gpt.sh -m 175 --tensor_model_parallel_size 8 --epoch_num 10 --pipeline_model_parallel_size 2 --sp"
)
if int(megatron_conf["gpt_175B_tp"]):
running_command["megatron_gpt175B_tp"] = (
f"bash scripts/megatron_gpt.sh -m 175 --tensor_model_parallel_size 8 --epoch_num 10 --pipeline_model_parallel 2"
f"bash scripts/megatron_gpt.sh -m 175 --tensor_model_parallel_size 8 --epoch_num 10 --pipeline_model_parallel_size 2"
)
if int(megatron_conf["gpt_22B"]):
running_command["megatron_gpt_22B"] = (
@@ -104,11 +104,11 @@ def read_config(config):
)
if int(aiob_conf["gpt_175B_aiob"]):
running_command["megatron_gpt175B_aiob"] = (
f"bash scripts/megatron_gpt.sh -m 175 --tensor_model_parallel_size 8 --epoch_num 10 --aiob_enable --pipeline_model_parallel 2 --sp"
f"bash scripts/megatron_gpt.sh -m 175 --tensor_model_parallel_size 8 --epoch_num 10 --aiob_enable --pipeline_model_parallel_size 2 --sp"
)
if int(aiob_conf["gpt_175B_tp_aiob"]):
running_command["megatron_gpt175B_tp_aiob"] = (
f"bash scripts/megatron_gpt.sh -m 175 --tensor_model_parallel_size 8 --epoch_num 10 --aiob_enable --pipeline_model_parallel 2 "
f"bash scripts/megatron_gpt.sh -m 175 --tensor_model_parallel_size 8 --epoch_num 10 --aiob_enable --pipeline_model_parallel_size 2 "
)
if int(aiob_conf["gpt_22B_aiob"]):
running_command["megatron_gpt_22B_aiob"] = (
23 changes: 14 additions & 9 deletions scripts/megatron_gpt.sh
@@ -14,7 +14,8 @@ seq_length=2048
micro_batch=1
epoch_num=1
tensor_model_parallel_size=8
pipeline_model_parallel=1
pipeline_model_parallel_size=1
context_parallel_size=1
vocab_size=50257
model_name=gpt_13b
ga_num=2
@@ -32,7 +33,8 @@ usage() {
--frame Communication framework: $frame
--world_size World size (number of nodes): $WORLD_SIZE
--tensor_model_parallel_size Tensor parallelism size: $tensor_model_parallel_size
--pipeline_model_parallel Pipeline parallelism size: $pipeline_model_parallel
--pipeline_model_parallel_size Pipeline parallelism size: $pipeline_model_parallel_size
--context_parallel_size Context parallelism size: $context_parallel_size
--global_batch Global batch size: $global_batch
--micro_batch Micro batch size: $micro_batch
--num_layers Number of layers: $num_layers
@@ -72,8 +74,10 @@ echo "Processing argument: $1"
world_size=$2; shift;;
--tensor_model_parallel_size|tp_num)
tensor_model_parallel_size=$2; shift;;
--pipeline_model_parallel|pp_num)
pipeline_model_parallel=$2; shift;;
--pipeline_model_parallel_size|pp_num)
pipeline_model_parallel_size=$2; shift;;
--context_parallel_size|cp_num)
context_parallel_size=$2; shift;;
--global_batch)
global_batch=$2; shift;;
--micro_batch)
@@ -169,7 +173,7 @@ case $model_size in
ffn_hidden_size=53248
num_attention_heads=128
tensor_model_parallel_size=8
pipeline_model_parallel=16
pipeline_model_parallel_size=16
;;
65)
model_name=llama_65B
Expand All @@ -178,7 +182,7 @@ case $model_size in
ffn_hidden_size=28672
num_attention_heads=64
tensor_model_parallel_size=8
pipeline_model_parallel=2
pipeline_model_parallel_size=2
;;
moe)
model_name=Mixtral_8*7B
Expand All @@ -199,8 +203,8 @@ case $model_size in
;;
esac

dp_num=$((world_size/tensor_model_parallel_size/pipeline_model_parallel))
global_batch=$((ga_num*dp_num*micro_batch))
data_parallel_size=$((world_size/tensor_model_parallel_size/pipeline_model_parallel_size))
global_batch=$((ga_num*data_parallel_size*micro_batch))
if [ $workload_only ]; then
script="python -m workload_generator.generate_megatron_workload"
else
@@ -220,7 +224,8 @@ cmd="$script \
--num_attention_heads=$num_attention_heads \
--seq_length=$seq_length \
--vocab_size=$vocab_size \
--pipeline_model_parallel=$pipeline_model_parallel \
--pipeline_model_parallel_size=$pipeline_model_parallel_size \
--context-parallel-size=$context_parallel_size \
--use-distributed-optimizer \
--max_position_embeddings=$max_position_embeddings \
${aiob_enable} \
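The renamed variables feed the derived sizes shown above: `data_parallel_size=$((world_size/tensor_model_parallel_size/pipeline_model_parallel_size))` and `global_batch=$((ga_num*data_parallel_size*micro_batch))`; note that `context_parallel_size` does not enter that division. A small Python restatement of the arithmetic, purely illustrative and not repository code:

```python
def derived_sizes(world_size: int, tp: int, pp: int, ga_num: int, micro_batch: int):
    # Mirrors the shell arithmetic: dp = world_size / (tp * pp), gb = ga_num * dp * mb.
    assert world_size % (tp * pp) == 0, "world_size must be divisible by tp * pp"
    data_parallel_size = world_size // (tp * pp)
    global_batch = ga_num * data_parallel_size * micro_batch
    return data_parallel_size, global_batch

# e.g. 8 GPUs, tp=2, pp=1, ga_num=2, micro_batch=1  ->  dp=4, global_batch=8
print(derived_sizes(8, 2, 1, 2, 1))
```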
15 changes: 10 additions & 5 deletions scripts/megatron_workload_with_aiob.sh
@@ -4,7 +4,8 @@
frame=Megatron
world_size=32
tensor_model_parallel_size=8
pipeline_model_parallel=1
pipeline_model_parallel_size=1
context_parallel_size=1
global_batch=1024
micro_batch=1
num_layers=40
@@ -31,7 +32,8 @@ usage() {
--frame communication framework, defaults to $frame
--world_size world size, defaults to $world_size
--tensor_model_parallel_size tensor parallelism size, defaults to $tensor_model_parallel_size
--pipeline_model_parallel pipeline parallelism size, defaults to $pipeline_model_parallel
--pipeline_model_parallel_size pipeline parallelism size, defaults to $pipeline_model_parallel_size
--context_parallel_size context parallelism size, defaults to $context_parallel_size
--global_batch global batch size, defaults to $global_batch
--micro_batch micro batch size, defaults to $micro_batch
--num_layers number of layers, defaults to $num_layers
@@ -68,8 +70,10 @@ do
world_size=$2; shift;;
--tensor_model_parallel_size|--tp)
tensor_model_parallel_size=$2; shift;;
--pipeline_model_parallel|--pp)
pipeline_model_parallel=$2; shift;;
--pipeline_model_parallel_size|--pp)
pipeline_model_parallel_size=$2; shift;;
--context_parallel_size|--cp)
context_parallel_size=$2; shift;;
--global_batch)
global_batch=$2; shift;;
--micro_batch)
@@ -181,7 +185,8 @@ cmd="python -m workload_generator.AIOB_simAI_workload_generator \
--frame=$frame \
--world_size=$world_size \
--tensor_model_parallel_size=$tensor_model_parallel_size \
--pipeline_model_parallel=$pipeline_model_parallel \
--pipeline_model_parallel_size=$pipeline_model_parallel_size \
--context-parallel-size=$context_parallel_size \
--global_batch=$global_batch \
--micro_batch=$micro_batch \
--num_layers=$num_layers \
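One spelling difference worth noting: the new option is passed as `--context-parallel-size` (hyphens) while the renamed one stays `--pipeline_model_parallel_size` (underscores). If the Python entry point parses its options with `argparse`, both spellings can still land on underscore attribute names, because `argparse` replaces `-` with `_` when deriving the destination. A hypothetical sketch (not the repository's actual parser):

```python
import argparse

# Hypothetical parser: shows only how argparse normalizes hyphenated flags.
parser = argparse.ArgumentParser()
parser.add_argument("--pipeline_model_parallel_size", type=int, default=1)
parser.add_argument("--context-parallel-size", type=int, default=1)

args = parser.parse_args(["--pipeline_model_parallel_size", "2",
                          "--context-parallel-size", "4"])
print(args.pipeline_model_parallel_size, args.context_parallel_size)  # 2 4
```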
10 changes: 5 additions & 5 deletions training/tutorial.md
@@ -45,7 +45,7 @@ export WORLD_SIZE=1
export RANK=0

sh ./scripts/megatron_gpt.sh \
-m 13 --world_size 8 --tensor_model_parallel_size 8 --pipeline_model_parallel 1 \
-m 13 --world_size 8 --tensor_model_parallel_size 8 --pipeline_model_parallel_size 1 \
--frame Megatron --global_batch 2 \
--micro_batch 1 --seq_length 4096 \
--swiglu --use_flash_attn --aiob_enable
@@ -107,7 +107,7 @@ We provide four pre-existing models (7/13/22/175)B to quickly generate the corre
Below is an example of generating a workload for a 7B model with tp 4, pp 1, a total GPU count of 4096, gbs 8192, mbs 1, a sequence length of 4096, flash_attn, swiglu, and aiob enabled, reading Example.txt for the computation times.
```bash
sh ./scripts/megatron_workload_with_aiob.sh -m 7 \
--world_size 4096 --tensor_model_parallel_size 4 --pipeline_model_parallel 1 \
--world_size 4096 --tensor_model_parallel_size 4 --pipeline_model_parallel_size 1 \
--frame Megatron --global_batch 8192 \
--micro_batch 1 --seq_length 4096 --swiglu \
--use_flash_attn --aiob_enable \
@@ -140,7 +140,7 @@ The main parameters for AICB are as follows:
| | max_position_embeddings | Maximum number of position embeddings to use. |
| | ffn_hidden_size | Transformer Feed-Forward Network hidden size. |
| Megatron parallel parameters | tensor_model_parallel_size | Degree of tensor model parallelism. |
| | pipeline_model_parallel | Degree of pipeline model parallelism. |
| | pipeline_model_parallel_size | Degree of pipeline model parallelism. |
| | enable_sequence_parallel | Enable sequence parallel optimization. |
| Megatron optimization parameters | use_flash_attn | Use FlashAttention implementation of attention. |
| | swiglu | Use gated linear units and SiLU activation instead of default gelu |
@@ -210,7 +210,7 @@ Here is an example:
```bash
python -m workload_generator.AIOB_simAI_workload_generator \
--model_name GPT-13B --frame=Megatron \
--world_size=16 --tensor_model_parallel_size=2 --pipeline_model_parallel=1 --global_batch=16 \
--world_size=16 --tensor_model_parallel_size=2 --pipeline_model_parallel_size=1 --global_batch=16 \
--micro_batch=1 --num_layers=40 --seq_length=2048 \
--hidden_size=5120 --epoch_num=1 \
--use-distributed-optimizer --num_attention_heads=40 \
@@ -288,7 +288,7 @@ Here is a brief example of the training process and workload items:
```python
trainer.init()
for _ in range(epoch_num):
if pipeline_model_parallel > 1:
if pipeline_model_parallel_size > 1:
trainer.with_pipeline_forward_backward()
else:
for _ in range(num_microbatches):