LLama 405B Tweaks #1298
base: mlperf/5.0
@@ -67,7 +67,7 @@ def create_orbax_checkpoint_manager(
   p.mkdir(exist_ok=True, parents=True)
   # we need to use ocdbt and zarr3 to control max file size in the checkpoint
   # omitting `iter` uses default handler for `iter`
-  item_handlers = {"items": PyTreeCheckpointHandler(use_ocdbt=use_ocdbt, use_zarr3=use_zarr3)}
+  item_handlers = {"items": PyTreeCheckpointHandler(save_concurrent_gb=500, use_ocdbt=use_ocdbt, use_zarr3=use_zarr3)}
   mngr = CheckpointManager(
       p,
       item_names=item_names,
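For context, a minimal sketch (not part of the diff) of how this handler configuration plugs into a CheckpointManager. The directory and item names are placeholders; save_concurrent_gb is intended to bound how much array data Orbax serializes concurrently during a save.

# Hedged sketch: same handler settings as the changed line above, placeholder path.
import orbax.checkpoint as ocp
from etils import epath

item_handlers = {
    "items": ocp.PyTreeCheckpointHandler(
        save_concurrent_gb=500,  # budget (GB) for concurrent array serialization on save
        use_ocdbt=True,          # ocdbt + zarr3 keep individual checkpoint files small
        use_zarr3=True,
    )
}
mngr = ocp.CheckpointManager(
    epath.Path("/tmp/ckpts"),    # placeholder checkpoint directory
    item_names=("items",),
    item_handlers=item_handlers,
)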
@@ -221,6 +221,7 @@ def map_to_pspec(data):
         single_replica_sharding=single_replica_sharding,
         global_shape=data.shape,
         dtype=data.dtype,
+        strict=False
     )

   if enable_single_replica_ckpt_restoring:
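The strict=False added above is a field on Orbax's ArrayRestoreArgs. A hedged sketch of what it expresses in isolation, with made-up shapes that mirror the 128K-to-32K vocab truncation discussed in a review comment further down:

# Hedged sketch (not from the diff): with strict=False, restoring into a target
# shape smaller than the checkpointed shape is allowed, and Orbax truncates the
# stored array (roughly A[:32000, :] for an embedding saved with a 128K vocab).
import jax.numpy as jnp
import orbax.checkpoint as ocp

restore_arg = ocp.ArrayRestoreArgs(
    global_shape=(32_000, 16_384),  # made-up target shape, smaller than the saved one
    dtype=jnp.bfloat16,
    strict=False,                   # tolerate the shape mismatch instead of raising
)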
@@ -320,12 +321,20 @@ def load_params_from_path(load_parameters_from_path, abstract_unboxed_params):
   assert load_parameters_from_path, "load_parameters_from_path is not defined."
   max_logging.log(f"restoring params from {load_parameters_from_path}")
   ckpt = epath.Path(load_parameters_from_path)
-  ckptr = ocp.PyTreeCheckpointer()
+  # ckptr = ocp.Checkpointer(ocp.PyTreeCheckpointHandler(restore_concurrent_gb=500, save_concurrent_gb=500))
+  ckptr = ocp.Checkpointer(ocp.PyTreeCheckpointHandler())
   # This is a memory optimization. We don't want to restore the entire checkpoint - only the params.
   # Rather than pass the entire abstract state, which could unnecessarily restore opt_state and such and waste
   # memory, we instead specify here that we are just restoring the params field of the checkpoint
   # (which itself may be a dictionary containing a key named 'params').
   restore_args = ocp.checkpoint_utils.construct_restore_args(abstract_unboxed_params)
+  def update_restore_args(restore_args):
+    for value in restore_args.values():
+      if type(value) == ocp._src.serialization.type_handlers.ArrayRestoreArgs:
+        value.strict = False
+      elif type(value) == dict:
+        update_restore_args(value)
+  update_restore_args(restore_args)
   restored = ckptr.restore(
       ckpt, item={"params": abstract_unboxed_params}, transforms={}, restore_args={"params": restore_args}
   )

Review comment (on the update_restore_args addition): This is Orbax's way of allowing truncation of the parameters when restoring a checkpoint. It is needed because NV used only a 32K vocab size, as opposed to the 128K one. The truncation Orbax does when strict mode is disabled is like A[:32000, :]. I made the entire PyTree non-strict, but you could probably restrict this to just the tokenizer embedding layer.
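As that comment suggests, the relaxation could be limited to the embedding layer instead of the whole PyTree. A hedged sketch, assuming the embedding leaf lives at restore_args["token_embedder"]["embedding"]; that path is hypothetical and will differ per model:

# Hedged sketch (not in the PR): relax strict for a single leaf of restore_args.
import orbax.checkpoint as ocp

def relax_embedding_strictness(restore_args, path=("token_embedder", "embedding")):
  """Set strict=False on one leaf of a nested restore_args dict."""
  node = restore_args
  for key in path[:-1]:
    node = node[key]                  # walk the nested dict down to the parent
  leaf = node[path[-1]]
  if isinstance(leaf, ocp.ArrayRestoreArgs):
    leaf.strict = False               # only this array may be truncated on restore
  return restore_args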
@@ -388,10 +388,10 @@ skip_jax_distributed_system: False # If True we will not initialize the jax dist
 # 2) Cosine decay from [learning_rate] to [learning_rate * cosine_learning_rate_final_fraction] from warmup to learning_rate_schedule_steps
 # 3) Constant learning rate of 0 from learning_rate_schedule_steps to steps.
 # The zero learning rate section can be used to more accurately measure the fully trained model's performance.
-learning_rate: 3.e-5
+learning_rate: 8.e-5
 cosine_learning_rate_final_fraction: 0.1
-warmup_steps_fraction: 0.1
-learning_rate_schedule_steps: -1 # By default the length of the schedule is set to the number of steps.
+warmup_steps_fraction: 0.0067
+learning_rate_schedule_steps: 2400000 # By default the length of the schedule is set to the number of steps.
 # However you may choose a longer schedule (learning_rate_schedule_steps > steps), in which case the training will end before
 # dropping fully down. Or you may choose a shorter schedule, where the unspecified steps will have a learning rate of 0.

Review comment (on warmup_steps_fraction: 0.0067): This is just gibberish; we should compute this value dynamically in pyconfig.
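Following that comment, a hedged sketch of how the fraction could be derived in pyconfig rather than hard-coded. The helper name and the idea of an explicit warmup-step count are assumptions; the PR's own values imply roughly 0.0067 * 2,400,000 = 16,080 warmup steps.

# Hedged sketch (not in the PR): derive warmup_steps_fraction from an explicit
# warmup step count instead of hard-coding 0.0067. Argument names are hypothetical.
def compute_warmup_steps_fraction(warmup_steps: int, learning_rate_schedule_steps: int) -> float:
  """Fraction of the LR schedule spent in linear warmup."""
  if learning_rate_schedule_steps <= 0:
    raise ValueError("learning_rate_schedule_steps must be positive to derive a fraction")
  return warmup_steps / learning_rate_schedule_steps

print(compute_warmup_steps_fraction(16_080, 2_400_000))  # 0.0067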
@@ -203,7 +203,7 @@ def save_checkpoint(
 ) -> bool:
   """Wrapper for saving checkpoint."""
   if config and config.enable_checkpointing:
-    if (step % config.checkpoint_period == 0) or (
+    if (step % config.checkpoint_period == 0 and step != 0) or (
         config.enable_emergency_checkpoint and step % config.local_checkpoint_period == 0
     ):
       blocking_until_ready_start = time.time()

Review comment (on the changed checkpoint_period condition): We should make sure NOT to save a checkpoint during our real runs. GPT3 had some hackery in its ckpt to start at a non-zero ckpt. Also set the …
Review comment: You might need something like this if you see an error like "ValueError: Requested more bytes than we reserved space for: 109924319232 > 96000000000"; you need to increase the restore limit. The sharded ckpt I linked to in our internal docs should resolve the need for this, though.
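For reference, a hedged sketch of the kind of change that comment points at, presumably the commented-out restore_concurrent_gb line earlier in this diff. The 500 GB figure is the PR's own and should be tuned to host memory.

# Hedged sketch: raise Orbax's restore budget if you hit
# "Requested more bytes than we reserved space for". Mirrors the commented-out
# line in load_params_from_path above.
import orbax.checkpoint as ocp

ckptr = ocp.Checkpointer(
    ocp.PyTreeCheckpointHandler(restore_concurrent_gb=500, save_concurrent_gb=500)
)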