Update PyTorch Llama3 70B recipe to calculate metrics from profile #29


Status: Open · wants to merge 1 commit into base `main`
8 changes: 4 additions & 4 deletions training/trillium/Llama3-70B-PyTorch/GCE/README.md
@@ -28,17 +28,17 @@ gcloud alpha compute tpus tpu-vm create $TPU_NAME \

The following setup runs the training job with Llama 3 70B on GCE TPUs using
the docker image from this registry
-(`us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-xla/llama3-70b:jan15built`).
-The docker image uses torch and torch_xla nightly build from 09/28/2024
+(`us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-xla/llama3-70b:feb14build`).
+The docker image uses torch and torch_xla nightly build from 02/11/2025
Comment on lines +31 to +32

Could we not create a label for the currently used test, and then rotate that between different versions? This could avoid possible human error and remove the requirement to change the version.

and comes with all the package dependency needed to run the model training.
All the command below should run from your own machine (not the TPU host you
-created).
+created). The Dockerfile used to build the image is at https://github.com/AI-Hypercomputer/tpu-recipes/blob/main/training/trillium/Llama3-70B-PyTorch/GCE/tpu.Dockerfile

1. git clone and navigate to this README repo and run training script:

```bash
git clone --depth 1 https://github.com/AI-Hypercomputer/tpu-recipes.git
-cd training/trillium/GCE/Llama3-70B-PyTorch
+cd training/trillium/Llama3-70B-PyTorch/GCE
```

2. Edit `env.sh` to add the hugging face token and/or setup the training parameters.
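As a hypothetical illustration of step 2 (the variable name `HF_TOKEN` and the file contents are assumptions for illustration, not taken from the actual recipe), `env.sh` might export the Hugging Face token like this:

```shell
# Hypothetical env.sh sketch: export the Hugging Face token so the training
# job can download the gated Llama 3 weights. Variable name is an assumption.
cat > /tmp/env.sh <<'EOF'
export HF_TOKEN="hf_your_token_here"   # placeholder, not a real token
EOF

# Source it and confirm the variable is visible to subsequent commands.
. /tmp/env.sh
echo "$HF_TOKEN"
```

Sourcing (rather than executing) the file is what makes the exported variables visible to the training commands that follow in the same shell.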
3 changes: 1 addition & 2 deletions training/trillium/Llama3-70B-PyTorch/GCE/host.sh
@@ -1,7 +1,6 @@
#!/bin/bash

-DOCKER_IMAGE=us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-xla/llama3-70b:jan15built
-
+DOCKER_IMAGE=us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-xla/llama3-70b:feb14build
worker_id=$(curl -s "http://metadata.google.internal/computeMetadata/v1/instance/attributes/agent-worker-number" -H 'Metadata-Flavor: Google')

cat >> /dev/null <<EOF
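The `worker_id` line above queries the GCE metadata server, which only resolves from inside a GCE VM. A hedged sketch of the same lookup with a fallback for machines outside GCE (the `-f` flag, `--max-time` bound, and the default value `0` are my additions, not part of the recipe):

```shell
# Query the per-worker metadata attribute. -f makes HTTP errors fail the
# command, and --max-time bounds the wait when metadata.google.internal
# does not resolve. Outside GCE this falls back to worker 0.
worker_id=$(curl -sf --max-time 2 \
  "http://metadata.google.internal/computeMetadata/v1/instance/attributes/agent-worker-number" \
  -H 'Metadata-Flavor: Google' || echo 0)
echo "worker_id=${worker_id}"
```

The `Metadata-Flavor: Google` header is required by the metadata server; requests without it are rejected as a protection against accidental or cross-origin reads.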
5 changes: 2 additions & 3 deletions training/trillium/Llama3-70B-PyTorch/GCE/tpu.Dockerfile
@@ -1,6 +1,5 @@
# Base package containing nightly PyTorch/XLA
-ARG BASE_IMAGE=us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm
-FROM ${BASE_IMAGE}
+FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_cxx11_20250211
Collaborator

Did you try running with the 20250211 base image on the full pod? Context: pytorch/xla#8683

Collaborator Author

Good catch! Let me try running on full pod as well.


# Install transformers library
ARG TRANSFORMERS_REPO=https://github.com/pytorch-tpu/transformers.git
@@ -10,7 +9,7 @@ RUN git clone "${TRANSFORMERS_REPO}" transformers && cd transformers && git chec

# Install transformers dependencies
WORKDIR /workspace/transformers
-RUN pip3 install git+file://$PWD accelerate datasets evaluate "huggingface_hub[cli]" \
+RUN pip3 install git+file://$PWD accelerate datasets protobuf evaluate "huggingface_hub[cli]" \
"torch_xla[pallas]" \
-f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html \
-f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
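The hunk header above shows the clone-and-checkout pattern this Dockerfile uses to pin the `transformers` fork. A minimal Dockerfile sketch of that reproducibility pattern (the commit value is a placeholder; the recipe's actual pinned SHA is truncated in the hunk header and is not reproduced here):

```dockerfile
# Pin the transformers fork at an explicit commit so image rebuilds are
# reproducible. TRANSFORMERS_COMMIT is a placeholder, not the real pin.
ARG TRANSFORMERS_REPO=https://github.com/pytorch-tpu/transformers.git
ARG TRANSFORMERS_COMMIT=put-pinned-sha-here
RUN git clone "${TRANSFORMERS_REPO}" transformers \
    && cd transformers \
    && git checkout "${TRANSFORMERS_COMMIT}"
```

Pinning a commit (like pinning the dated `20250211` base image above) trades freshness for the ability to rebuild the exact same image later.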