diff --git a/.github/scripts/install-cuda-aarch64.sh b/.github/scripts/install-cuda-aarch64.sh
index 8301b333e4..3f0b3b2178 100755
--- a/.github/scripts/install-cuda-aarch64.sh
+++ b/.github/scripts/install-cuda-aarch64.sh
@@ -3,12 +3,16 @@ install_cuda_aarch64() {
     echo "install cuda ${CU_VERSION}"
     # CU_VERSION: cu128 --> CU_VER: 12-8
     CU_VER=${CU_VERSION:2:2}-${CU_VERSION:4:1}
+    # CU_VERSION: cu129 --> CU_DOT_VER: 12.9
+    CU_DOT_VER=${CU_VERSION:2:2}.${CU_VERSION:4:1}
     dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
+    # nccl version must match the one libtorch_cuda.so was built with: https://github.com/pytorch/pytorch/blob/main/.ci/docker/ci_commit_pins/nccl-cu12.txt
     dnf -y install cuda-compiler-${CU_VER}.aarch64 \
                    cuda-libraries-${CU_VER}.aarch64 \
-                   cuda-libraries-devel-${CU_VER}.aarch64
+                   cuda-libraries-devel-${CU_VER}.aarch64 \
+                   libnccl-2.27.3-1+cuda${CU_DOT_VER} libnccl-devel-2.27.3-1+cuda${CU_DOT_VER} libnccl-static-2.27.3-1+cuda${CU_DOT_VER}
     dnf clean all
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib64:$LD_LIBRARY_PATH
     ls -lart /usr/local/
     nvcc --version
     echo "cuda ${CU_VER} installed successfully"
diff --git a/.github/scripts/install-torch-tensorrt.sh b/.github/scripts/install-torch-tensorrt.sh
index 109b9565f3..c567a6f172 100755
--- a/.github/scripts/install-torch-tensorrt.sh
+++ b/.github/scripts/install-torch-tensorrt.sh
@@ -2,7 +2,7 @@
 set -x

 TORCH=$(grep "^torch>" ${PWD}/py/requirements.txt)
-TORCHVISION=$(grep "^torchvision" ${PWD}/py/requirements.txt)
+TORCHVISION=$(grep "^torchvision>" ${PWD}/tests/py/requirements.txt)
 INDEX_URL=https://download.pytorch.org/whl/${CHANNEL}/${CU_VERSION}
 PLATFORM=$(python -c "import sys; print(sys.platform)")

@@ -14,8 +14,12 @@ fi

 # Install all the dependencies required for Torch-TensorRT
 pip install --pre -r ${PWD}/tests/py/requirements.txt
+# Dependencies in tests/py/requirements.txt might install a different version of torch or torchvision,
+# e.g. timm will pull in the latest torchvision, but we want the torchvision from the nightly index.
+# Reinstall torch and torchvision to make sure we have the correct versions.
+pip uninstall -y torch torchvision
+pip install --force-reinstall --pre ${TORCHVISION} --index-url ${INDEX_URL} --no-deps
 pip install --force-reinstall --pre ${TORCH} --index-url ${INDEX_URL}
-pip install --force-reinstall --pre ${TORCHVISION} --index-url ${INDEX_URL}


 # Install Torch-TensorRT
diff --git a/.github/workflows/build-test-linux-x86_64.yml b/.github/workflows/build-test-linux-x86_64.yml
index f788f88dc9..51f3730d02 100644
--- a/.github/workflows/build-test-linux-x86_64.yml
+++ b/.github/workflows/build-test-linux-x86_64.yml
@@ -138,22 +138,7 @@ jobs:
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
-        cd tests/py
-        major=${PYTHON_VERSION%%.*}
-        minor=${PYTHON_VERSION#*.}
-        minor=${minor%%.*}
-        if (( major > 3 || (major == 3 && minor >= 13) )); then
-          echo "flashinfer-python is not supported for python version 3.13 or higher"
-        else
-          echo "Installing flashinfer-python"
-          # flashinfer-python is broken on python 3.9 at the moment, so we skip it for now
-          if (major == 3 && minor == 9); then
-            echo "Skipping flashinfer-python for python 3.9"
-          else
-            python -m pip install flashinfer-python --no-deps
-          fi
-        fi
-        cd dynamo
+        cd tests/py/dynamo
         python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/
         python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml automatic_plugin/test_automatic_plugin.py
         python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml automatic_plugin/test_automatic_plugin_with_attrs.py
diff --git a/.github/workflows/build-test-tensorrt-linux.yml b/.github/workflows/build-test-tensorrt-linux.yml
index 0cf9eab4fd..81814ea719 100644
--- a/.github/workflows/build-test-tensorrt-linux.yml
+++ b/.github/workflows/build-test-tensorrt-linux.yml
@@ -94,6 +94,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
@@ -130,6 +131,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -160,6 +162,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -190,6 +193,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -220,6 +224,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -252,6 +257,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -284,6 +290,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -316,6 +323,7 @@ jobs:
       build-matrix: ${{ needs.generate-tensorrt-matrix.outputs.matrix }}
       pre-script: ${{ matrix.pre-script }}
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
diff --git a/.github/workflows/build-test-tensorrt-windows.yml b/.github/workflows/build-test-tensorrt-windows.yml
index 0c847a459c..427b689656 100644
--- a/.github/workflows/build-test-tensorrt-windows.yml
+++ b/.github/workflows/build-test-tensorrt-windows.yml
@@ -101,6 +101,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -133,6 +134,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -160,6 +162,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -187,6 +190,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -214,6 +218,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -243,6 +248,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -272,6 +278,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
@@ -300,6 +307,7 @@ jobs:
       build-matrix: ${{ needs.substitute-runner.outputs.matrix }}
       pre-script: packaging/driver_upgrade.bat
       script: |
+        set -euo pipefail
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         pushd .
diff --git a/.github/workflows/windows-test.yml b/.github/workflows/windows-test.yml
index a8b27c0aa9..8dc1b107d3 100644
--- a/.github/workflows/windows-test.yml
+++ b/.github/workflows/windows-test.yml
@@ -39,7 +39,10 @@ on:
         description: "Prevents a job from failing when a step fails. Set to true to allow a job to pass when exec script step fails."
         default: false
         type: boolean
-
+      architecture:
+        description: 'CPU architecture to build for'
+        default: "x64"
+        type: string
 jobs:
   test:
     strategy:
@@ -107,7 +110,7 @@ jobs:
         if: ${{ matrix.tensorrt == '' }}
         uses: actions/download-artifact@v4
         with:
-          name: ${{ env.ARTIFACT_NAME }}
+          name: ${{ env.ARTIFACT_NAME }}${{ inputs.architecture }}
           path: ${{ runner.temp }}/artifacts/
       - name: Download artifacts
         if: ${{ matrix.tensorrt != '' }}
diff --git a/MODULE.bazel b/MODULE.bazel
index a22e70f071..1ecaebba28 100644
--- a/MODULE.bazel
+++ b/MODULE.bazel
@@ -66,7 +66,7 @@ http_archive(
     name = "libtorch",
     build_file = "@//third_party/libtorch:BUILD",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/nightly/cu128/libtorch-cxx11-abi-shared-with-deps-latest.zip"],
+    urls = ["https://download.pytorch.org/libtorch/nightly/cu128/libtorch-shared-with-deps-latest.zip"],
 )

 # in aarch64 platform you can get libtorch via either local or wheel file
diff --git a/packaging/pre_build_script.sh b/packaging/pre_build_script.sh
index 707edcd718..1f1a2120a9 100755
--- a/packaging/pre_build_script.sh
+++ b/packaging/pre_build_script.sh
@@ -66,14 +66,6 @@ if [[ ${TENSORRT_VERSION} != "" ]]; then
        pyproject.toml
 fi

-if [[ "${CU_VERSION::4}" < "cu12" ]]; then
-    # replace dependencies from tensorrt-cu12-bindings/libs to tensorrt-cu11-bindings/libs
-    sed -i -e "s/tensorrt-cu12/tensorrt-${CU_VERSION::4}/g" \
-           -e "s/tensorrt-cu12-bindings/tensorrt-${CU_VERSION::4}-bindings/g" \
-           -e "s/tensorrt-cu12-libs/tensorrt-${CU_VERSION::4}-libs/g" \
-           pyproject.toml
-fi
-
 cat toolchains/ci_workspaces/MODULE.bazel.tmpl | envsubst > MODULE.bazel

 if [[ ${TENSORRT_VERSION} != "" ]]; then
diff --git a/packaging/pre_build_script_windows.sh b/packaging/pre_build_script_windows.sh
index 7d089b602a..b5b62ebf05 100644
--- a/packaging/pre_build_script_windows.sh
+++ b/packaging/pre_build_script_windows.sh
@@ -17,14 +17,6 @@ if [[ ${TENSORRT_VERSION} != "" ]]; then
        pyproject.toml
 fi

-if [[ "${CU_VERSION::4}" < "cu12" ]]; then
-    # replace dependencies from tensorrt-cu12-bindings/libs to tensorrt-cu11-bindings/libs
-    sed -i -e "s/tensorrt-cu12/tensorrt-${CU_VERSION::4}/g" \
-           -e "s/tensorrt-cu12-bindings/tensorrt-${CU_VERSION::4}-bindings/g" \
-           -e "s/tensorrt-cu12-libs/tensorrt-${CU_VERSION::4}-libs/g" \
-           pyproject.toml
-fi
-
 TORCH=$(grep "^torch>" py/requirements.txt)
 INDEX_URL=https://download.pytorch.org/whl/${CHANNEL}/${CU_VERSION}
diff --git a/py/requirements.txt b/py/requirements.txt
index 00cb832331..302b7e92af 100644
--- a/py/requirements.txt
+++ b/py/requirements.txt
@@ -1,9 +1,8 @@
 numpy
 packaging
 pybind11==2.6.2
---extra-index-url https://download.pytorch.org/whl/nightly/cu128
+--extra-index-url https://download.pytorch.org/whl/nightly/cu129
 torch>=2.8.0.dev,<2.9.0
-torchvision>=0.22.0.dev,<0.23.0
 --extra-index-url https://pypi.ngc.nvidia.com
 pyyaml
 dllist
\ No newline at end of file
diff --git a/tests/py/requirements.txt b/tests/py/requirements.txt
index dd217bfd5c..b806a668db 100644
--- a/tests/py/requirements.txt
+++ b/tests/py/requirements.txt
@@ -7,7 +7,12 @@ parameterized>=0.2.0
 pytest>=8.2.1
 pytest-xdist>=3.6.1
 pyyaml
-timm>=1.0.3
 transformers==4.49.0
-nvidia-modelopt[all]~=0.27.0; python_version >'3.9' and python_version <'3.13'
---extra-index-url https://pypi.nvidia.com
\ No newline at end of file
+nvidia-modelopt[all]; python_version >'3.9' and python_version <'3.13'
+--extra-index-url https://pypi.nvidia.com
+# flashinfer-python is not supported on Python 3.13 or higher
+# flashinfer-python is currently broken on Python 3.9, so skip it for now
+flashinfer-python; python_version >'3.9' and python_version <'3.13'
+--extra-index-url https://download.pytorch.org/whl/nightly/cu129
+torchvision>=0.23.0.dev,<0.24.0
+timm>=1.0.3
\ No newline at end of file
diff --git a/toolchains/ci_workspaces/MODULE.bazel.tmpl b/toolchains/ci_workspaces/MODULE.bazel.tmpl
index 562b55c6ce..4f03473c08 100644
--- a/toolchains/ci_workspaces/MODULE.bazel.tmpl
+++ b/toolchains/ci_workspaces/MODULE.bazel.tmpl
@@ -65,7 +65,7 @@ http_archive = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 #    name = "libtorch",
 #    build_file = "@//third_party/libtorch:BUILD",
 #    strip_prefix = "libtorch",
-#    urls = ["https://download.pytorch.org/libtorch/${CHANNEL}/${CU_VERSION}/libtorch-cxx11-abi-shared-with-deps-latest.zip"],
+#    urls = ["https://download.pytorch.org/libtorch/${CHANNEL}/${CU_VERSION}/libtorch-shared-with-deps-latest.zip"],
 #)

 # Download these tarballs manually from the NVIDIA website
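
Note on the version-string handling in install-cuda-aarch64.sh above: both CU_VER and CU_DOT_VER are derived from CU_VERSION with bash substring expansion. A minimal standalone sketch of that parsing (illustration only, not part of the patch):

    # ${CU_VERSION:2:2} takes two characters starting at index 2 ("12"),
    # ${CU_VERSION:4:1} takes one character at index 4 ("8" or "9").
    # This assumes a single-digit CUDA minor version, e.g. cu128 or cu129.
    CU_VERSION=cu129
    CU_VER=${CU_VERSION:2:2}-${CU_VERSION:4:1}       # 12-9, used in the dnf package names
    CU_DOT_VER=${CU_VERSION:2:2}.${CU_VERSION:4:1}   # 12.9, used in the libnccl "+cuda" suffix
    echo "${CU_VER} ${CU_DOT_VER}"                   # prints: 12-9 12.9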