diff --git a/.github/scripts/install-cuda-aarch64.sh b/.github/scripts/install-cuda-aarch64.sh
index 8301b333e4..3f0b3b2178 100755
--- a/.github/scripts/install-cuda-aarch64.sh
+++ b/.github/scripts/install-cuda-aarch64.sh
@@ -3,12 +3,16 @@ install_cuda_aarch64() {
     echo "install cuda ${CU_VERSION}"
     # CU_VERSION: cu128 --> CU_VER: 12-8
     CU_VER=${CU_VERSION:2:2}-${CU_VERSION:4:1}
+    # CU_VERSION: cu129 --> CU_DOT_VER: 12.9
+    CU_DOT_VER=${CU_VERSION:2:2}.${CU_VERSION:4:1}
     dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
+    # nccl version must match the one libtorch_cuda.so was built with: https://github.com/pytorch/pytorch/blob/main/.ci/docker/ci_commit_pins/nccl-cu12.txt
     dnf -y install cuda-compiler-${CU_VER}.aarch64 \
                    cuda-libraries-${CU_VER}.aarch64 \
-                   cuda-libraries-devel-${CU_VER}.aarch64
+                   cuda-libraries-devel-${CU_VER}.aarch64 \
+                   libnccl-2.27.3-1+cuda${CU_DOT_VER} libnccl-devel-2.27.3-1+cuda${CU_DOT_VER} libnccl-static-2.27.3-1+cuda${CU_DOT_VER}
     dnf clean all
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib64:$LD_LIBRARY_PATH
     ls -lart /usr/local/
     nvcc --version
     echo "cuda ${CU_VER} installed successfully"
diff --git a/.github/scripts/install-torch-tensorrt.sh b/.github/scripts/install-torch-tensorrt.sh
index 109b9565f3..c567a6f172 100755
--- a/.github/scripts/install-torch-tensorrt.sh
+++ b/.github/scripts/install-torch-tensorrt.sh
@@ -2,7 +2,7 @@
 set -x

 TORCH=$(grep "^torch>" ${PWD}/py/requirements.txt)
-TORCHVISION=$(grep "^torchvision" ${PWD}/py/requirements.txt)
+TORCHVISION=$(grep "^torchvision>" ${PWD}/tests/py/requirements.txt)
 INDEX_URL=https://download.pytorch.org/whl/${CHANNEL}/${CU_VERSION}
 PLATFORM=$(python -c "import sys; print(sys.platform)")

@@ -14,8 +14,12 @@ fi

 # Install all the dependencies required for Torch-TensorRT
 pip install --pre -r ${PWD}/tests/py/requirements.txt
+# Dependencies in tests/py/requirements.txt might install a different version of torch or torchvision,
+# e.g. timm will pull in the latest torchvision, but we want the torchvision from the nightly index.
+# Reinstall torch and torchvision to make sure we have the correct versions.
+pip uninstall -y torch torchvision
+pip install --force-reinstall --pre ${TORCHVISION} --index-url ${INDEX_URL} --no-deps
 pip install --force-reinstall --pre ${TORCH} --index-url ${INDEX_URL}
-pip install --force-reinstall --pre ${TORCHVISION} --index-url ${INDEX_URL}


 # Install Torch-TensorRT
diff --git a/.github/workflows/build-test-linux-x86_64.yml b/.github/workflows/build-test-linux-x86_64.yml
index f788f88dc9..51f3730d02 100644
--- a/.github/workflows/build-test-linux-x86_64.yml
+++ b/.github/workflows/build-test-linux-x86_64.yml
@@ -138,22 +138,7 @@ jobs:
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
-        cd tests/py
-        major=${PYTHON_VERSION%%.*}
-        minor=${PYTHON_VERSION#*.}
-        minor=${minor%%.*}
-        if (( major > 3 || (major == 3 && minor >= 13) )); then
-          echo "flashinfer-python is not supported for python version 3.13 or higher"
-        else
-          echo "Installing flashinfer-python"
-          # flashinfer-python is broken on python 3.9 at the moment, so we skip it for now
-          if (major == 3 && minor == 9); then
-            echo "Skipping flashinfer-python for python 3.9"
-          else
-            python -m pip install flashinfer-python --no-deps
-          fi
-        fi
-        cd dynamo
+        cd tests/py/dynamo
         python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/
         python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml automatic_plugin/test_automatic_plugin.py
         python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml automatic_plugin/test_automatic_plugin_with_attrs.py
diff --git a/.github/workflows/build-test-tensorrt-linux.yml b/.github/workflows/build-test-tensorrt-linux.yml
index 0cf9eab4fd..81814ea719 100644
--- a/.github/workflows/build-test-tensorrt-linux.yml
+++ b/.github/workflows/build-test-tensorrt-linux.yml
@@ -94,6 +94,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
@@ -130,6 +131,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -160,6 +162,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -190,6 +193,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -220,6 +224,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -252,6 +257,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -284,6 +290,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -316,6 +323,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
diff --git a/.github/workflows/build-test-tensorrt-windows.yml b/.github/workflows/build-test-tensorrt-windows.yml
index 0c847a459c..427b689656 100644
--- a/.github/workflows/build-test-tensorrt-windows.yml
+++ b/.github/workflows/build-test-tensorrt-windows.yml
@@ -101,6 +101,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -133,6 +134,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -160,6 +162,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -187,6 +190,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -214,6 +218,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -243,6 +248,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -272,6 +278,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -300,6 +307,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
diff --git a/.github/workflows/windows-test.yml b/.github/workflows/windows-test.yml
index a8b27c0aa9..8dc1b107d3 100644
--- a/.github/workflows/windows-test.yml
+++ b/.github/workflows/windows-test.yml
@@ -39,7 +39,10 @@ on:
         description: "Prevents a job from failing when a step fails. Set to true to allow a job to pass when exec script step fails."
         default: false
         type: boolean
-
+      architecture:
+        description: 'CPU architecture to build for'
+        default: "x64"
+        type: string
 jobs:
   test:
     strategy:
@@ -107,7 +110,7 @@ jobs:
         if: ${{ matrix.tensorrt == '' }}
         uses: actions/download-artifact@v4
         with:
-          name: ${{ env.ARTIFACT_NAME }}
+          name: ${{ env.ARTIFACT_NAME }}${{ inputs.architecture }}
           path: ${{ runner.temp }}/artifacts/
       - name: Download artifacts
         if: ${{ matrix.tensorrt != '' }}
diff --git a/MODULE.bazel b/MODULE.bazel
index a22e70f071..1ecaebba28 100644
--- a/MODULE.bazel
+++ b/MODULE.bazel
@@ -66,7 +66,7 @@ http_archive(
     name = "libtorch",
     build_file = "@//third_party/libtorch:BUILD",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/nightly/cu128/libtorch-cxx11-abi-shared-with-deps-latest.zip"],
+    urls = ["https://download.pytorch.org/libtorch/nightly/cu128/libtorch-shared-with-deps-latest.zip"],
 )

 # in aarch64 platform you can get libtorch via either local or wheel file
diff --git a/packaging/pre_build_script.sh b/packaging/pre_build_script.sh
index 707edcd718..1f1a2120a9 100755
--- a/packaging/pre_build_script.sh
+++ b/packaging/pre_build_script.sh
@@ -66,14 +66,6 @@ if [[ ${TENSORRT_VERSION} != "" ]]; then
        pyproject.toml
 fi

-if [[ "${CU_VERSION::4}" < "cu12" ]]; then
-    # replace dependencies from tensorrt-cu12-bindings/libs to tensorrt-cu11-bindings/libs
-    sed -i -e "s/tensorrt-cu12/tensorrt-${CU_VERSION::4}/g" \
-           -e "s/tensorrt-cu12-bindings/tensorrt-${CU_VERSION::4}-bindings/g" \
-           -e "s/tensorrt-cu12-libs/tensorrt-${CU_VERSION::4}-libs/g" \
-           pyproject.toml
-fi
-
 cat toolchains/ci_workspaces/MODULE.bazel.tmpl | envsubst > MODULE.bazel

 if [[ ${TENSORRT_VERSION} != "" ]]; then
diff --git a/packaging/pre_build_script_windows.sh b/packaging/pre_build_script_windows.sh
index 7d089b602a..b5b62ebf05 100644
--- a/packaging/pre_build_script_windows.sh
+++ b/packaging/pre_build_script_windows.sh
@@ -17,14 +17,6 @@ if [[ ${TENSORRT_VERSION} != "" ]]; then
        pyproject.toml
 fi

-if [[ "${CU_VERSION::4}" < "cu12" ]]; then
-    # replace dependencies from tensorrt-cu12-bindings/libs to tensorrt-cu11-bindings/libs
-    sed -i -e "s/tensorrt-cu12/tensorrt-${CU_VERSION::4}/g" \
-           -e "s/tensorrt-cu12-bindings/tensorrt-${CU_VERSION::4}-bindings/g" \
-           -e "s/tensorrt-cu12-libs/tensorrt-${CU_VERSION::4}-libs/g" \
-           pyproject.toml
-fi
-
 TORCH=$(grep "^torch>" py/requirements.txt)
 INDEX_URL=https://download.pytorch.org/whl/${CHANNEL}/${CU_VERSION}
diff --git a/py/requirements.txt b/py/requirements.txt
index 00cb832331..302b7e92af 100644
--- a/py/requirements.txt
+++ b/py/requirements.txt
@@ -1,9 +1,8 @@
 numpy
 packaging
 pybind11==2.6.2
---extra-index-url https://download.pytorch.org/whl/nightly/cu128
+--extra-index-url https://download.pytorch.org/whl/nightly/cu129
 torch>=2.8.0.dev,<2.9.0
-torchvision>=0.22.0.dev,<0.23.0
 --extra-index-url https://pypi.ngc.nvidia.com
 pyyaml
 dllist
\ No newline at end of file
diff --git a/tests/py/requirements.txt b/tests/py/requirements.txt
index dd217bfd5c..b806a668db 100644
--- a/tests/py/requirements.txt
+++ b/tests/py/requirements.txt
@@ -7,7 +7,12 @@ parameterized>=0.2.0
 pytest>=8.2.1
 pytest-xdist>=3.6.1
 pyyaml
-timm>=1.0.3
 transformers==4.49.0
-nvidia-modelopt[all]~=0.27.0; python_version >'3.9' and python_version <'3.13'
---extra-index-url https://pypi.nvidia.com
\ No newline at end of file
+nvidia-modelopt[all]; python_version >'3.9' and python_version <'3.13'
+--extra-index-url https://pypi.nvidia.com
+# flashinfer-python is not supported on Python 3.13 or higher
+# flashinfer-python is currently broken on Python 3.9, so skip it for now
+flashinfer-python; python_version >'3.9' and python_version <'3.13'
+--extra-index-url https://download.pytorch.org/whl/nightly/cu129
+torchvision>=0.23.0.dev,<0.24.0
+timm>=1.0.3
\ No newline at end of file
diff --git a/toolchains/ci_workspaces/MODULE.bazel.tmpl b/toolchains/ci_workspaces/MODULE.bazel.tmpl
index 562b55c6ce..4f03473c08 100644
--- a/toolchains/ci_workspaces/MODULE.bazel.tmpl
+++ b/toolchains/ci_workspaces/MODULE.bazel.tmpl
@@ -65,7 +65,7 @@ http_archive = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 #    name = "libtorch",
 #    build_file = "@//third_party/libtorch:BUILD",
 #    strip_prefix = "libtorch",
-#    urls = ["https://download.pytorch.org/libtorch/${CHANNEL}/${CU_VERSION}/libtorch-cxx11-abi-shared-with-deps-latest.zip"],
+#    urls = ["https://download.pytorch.org/libtorch/${CHANNEL}/${CU_VERSION}/libtorch-shared-with-deps-latest.zip"],
 #)

 # Download these tarballs manually from the NVIDIA website
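
Note on the version-string handling in install-cuda-aarch64.sh above: both CU_VER and CU_DOT_VER are derived from CU_VERSION with bash substring expansion. A minimal standalone sketch of that parsing (illustration only, not part of the patch):

    # ${CU_VERSION:2:2} takes two characters starting at index 2 ("12"),
    # ${CU_VERSION:4:1} takes one character at index 4 ("8" or "9").
    # This assumes a single-digit CUDA minor version, e.g. cu128 or cu129.
    CU_VERSION=cu129
    CU_VER=${CU_VERSION:2:2}-${CU_VERSION:4:1}       # 12-9, used in the dnf package names
    CU_DOT_VER=${CU_VERSION:2:2}.${CU_VERSION:4:1}   # 12.9, used in the libnccl "+cuda" suffix
    echo "${CU_VER} ${CU_DOT_VER}"                   # prints: 12-9 12.9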