Whl skip nccl (#125) #899
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow header: triggers, run-level concurrency and environment shared by
# every job below.
name: "build-test"
on:  # rebuild any PRs and main branch changes
  pull_request:
  push:
    branches:
      - main
      - "releases/*"
# Only one run per git ref is kept alive; a newer run cancels the older one.
concurrency:
  group: test-${{ github.ref }}
  cancel-in-progress: true
env:
  OSS_ACCESS_KEY_ID: ${{ secrets.OSS_ACCESS_KEY_ID }}
  OSS_ACCESS_KEY_SECRET: ${{ secrets.OSS_ACCESS_KEY_SECRET }}
  # Ref of Oneflow-Inc/oneflow checked out by the jobs that pass `ref:`.
  ONEFLOW_REF: master
  # Forwarded as the `parallel` input of the build steps.
  ONEFLOW_CI_BUILD_PARALLEL: 32
jobs:
  # Checks out this repo and Oneflow-Inc/oneflow (default ref), then runs
  # `npm install` and `npm run all`.
  build:  # make sure build/ci work properly
    name: "Build and test this repo"
    runs-on: ubuntu-latest
    env:
      ONEFLOW_SRC: oneflow-src
    steps:
      - uses: actions/checkout@v2
      - name: Checkout Oneflow-Inc/oneflow
        uses: actions/checkout@v2
        with:
          repository: Oneflow-Inc/oneflow
          path: ${{ env.ONEFLOW_SRC }}
      - run: |
          npm install
      - run: |
          npm run all

  # Runs the root action (`uses: ./`) with `oneflow-build-env: conda` on a
  # GitHub-hosted runner and on a self-hosted runner.
  test-build-oneflow:
    name: "Build OneFlow Conda"
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      max-parallel: 5
      matrix:
        entry: [gh-hosted, self-hosted]
        include:
          - entry: gh-hosted
            os: [ubuntu-latest]
          - entry: self-hosted
            os: ["self-hosted", "linux", "provision"]
    env:
      ONEFLOW_SRC: oneflow-src
    steps:
      - name: Fix permissions
        if: ${{ contains(matrix.os, 'self-hosted') }}
        # chown the workspace back to the runner user from inside a container.
        run: |
          set -x
          docker run --rm -v "$PWD":/p -w /p busybox chown -R "$(id -u):$(id -g)" .
      - name: Remove leftover cuda-installer.log
        if: ${{ contains(matrix.os, 'self-hosted') }}
        run: |
          docker run --rm -v /tmp:/host/tmp -w /p busybox rm -f /host/tmp/cuda-installer.log
      - uses: actions/checkout@v2
      - name: Checkout Oneflow-Inc/oneflow
        uses: actions/checkout@v2
        with:
          repository: Oneflow-Inc/oneflow
          ref: ${{ env.ONEFLOW_REF }}
          path: ${{ env.ONEFLOW_SRC }}
      - uses: ./cache-complete
        name: Save cache if successful
        id: save-cache
        timeout-minutes: 5
        with:
          oneflow-src: ${{ env.ONEFLOW_SRC }}
          entry: build-with-clang
          digest-type: build
          # Only true when the PR head lives in this repository (not a fork).
          mark-as-completed: ${{ github.event.pull_request.head.repo.full_name == github.repository }}
      # NOTE(review): `matrix.cache-hit` is not defined by this job's matrix
      # (only `entry` and `os` are), so `!matrix.cache-hit` is always true in
      # this step and the two build steps below. If skipping on a cache hit
      # is intended, `!fromJSON(steps.save-cache.outputs.cache-hit)` (as used
      # in test-build-manylinux) is probably what was meant -- confirm.
      - name: Checkout Oneflow-Inc/conda-env
        if: ${{ !matrix.cache-hit }}
        uses: actions/checkout@v2
        with:
          repository: Oneflow-Inc/conda-env
          ref: 30a7f00eb48ee9009d85a848e720823e5054c66b
          path: conda-env
      - uses: ./
        name: Test conda on gh-hosted
        if: ${{ contains(matrix.os, 'ubuntu-latest') && !matrix.cache-hit }}
        with:
          cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/ci/gh-hosted/cpu-clang.cmake
          oneflow-src: ${{ env.ONEFLOW_SRC }}
          oneflow-build-env: conda
          conda-env-file: conda-env/dev/clang10/environment-v2.yml
          conda-env-name: oneflow-dev-clang10-v2
      - uses: ./
        name: Test conda on self-hosted
        if: ${{ contains(matrix.os, 'self-hosted') && !matrix.cache-hit }}
        with:
          cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/cn/fast/cpu-clang.cmake
          oneflow-src: ${{ env.ONEFLOW_SRC }}
          oneflow-build-env: conda
          conda-env-file: conda-env/dev/clang10/environment-v2.yml
          conda-installer-url: https://oneflow-static.oss-cn-beijing.aliyuncs.com/downloads/conda-installers/Miniconda3-py39_4.10.3-Linux-x86_64.sh
          conda-prefix: ~/miniconda3-prefixes/py39_4.10.3
          self-hosted: ${{ contains(matrix.os, 'self-hosted') }}
          conda-env-name: oneflow-dev-clang10-v2
          parallel: ${{ env.ONEFLOW_CI_BUILD_PARALLEL }}

  # Runs ./cache-complete/matrix/build and publishes its `matrix` output,
  # which test-build-manylinux expands with fromJSON.
  find-build-cache:
    name: "Find build cache"
    runs-on: ubuntu-latest
    env:
      ONEFLOW_SRC: oneflow-src
    outputs:
      matrix: ${{ steps.find-cache.outputs.matrix }}
    steps:
      - uses: actions/checkout@v2
      - name: Checkout Oneflow-Inc/oneflow
        uses: actions/checkout@v2
        with:
          repository: Oneflow-Inc/oneflow
          ref: ${{ env.ONEFLOW_REF }}
          path: ${{ env.ONEFLOW_SRC }}
      - uses: ./cache-complete/matrix/build
        name: find cache
        id: find-cache
        with:
          # Driven by the `need-clean-ccache` label on the pull request.
          delete-cache: ${{ contains(github.event.pull_request.labels.*.name, 'need-clean-ccache') }}
          runner-labels: |
            self-hosted
            linux
            provision
          oneflow-src: ${{ env.ONEFLOW_SRC }}
          entries: |
            cu102
            cpu
            cu102_xla

  # Builds manylinux wheels for each matrix entry produced by
  # find-build-cache, one entry at a time (max-parallel: 1), and uploads the
  # wheels and the packed liboneflow over scp for every entry except
  # cu102_xla.
  test-build-manylinux:
    name: "Build OneFlow manylinux"
    runs-on: ${{ matrix.runs-on }}
    needs: [find-build-cache]
    concurrency:
      group: test-build-manylinux-${{ matrix.entry }}-${{ matrix.build-digest }}
      cancel-in-progress: true
    strategy:
      fail-fast: false
      max-parallel: 1
      matrix: ${{ fromJSON(needs.find-build-cache.outputs.matrix) }}
    env:
      ONEFLOW_SRC: oneflow-src
      MANYLINUX_CACHE_DIR: ~/manylinux-cache-dir/${{ matrix.entry }}
      WHEELHOUSE_DIR: manylinux-wheelhouse
      SSH_TANK_HOST: 192.168.1.23
      SSH_TANK_PATH: /home/ci-user/tank
      SSH_USERNAME: ci-user
    steps:
      - name: Fix permissions
        if: ${{ contains(matrix.runs-on, 'self-hosted') }}
        run: |
          set -x
          docker run --rm -v "$PWD":/p -w /p busybox chown -R "$(id -u):$(id -g)" .
      - name: Remove leftover cuda-installer.log
        if: ${{ contains(matrix.runs-on, 'self-hosted') }}
        run: |
          docker run --rm -v /tmp:/host/tmp -w /p busybox rm -f /host/tmp/cuda-installer.log
      - name: Checkout this repo
        uses: actions/checkout@v2
      - name: Checkout Oneflow-Inc/oneflow
        uses: actions/checkout@v2
        with:
          repository: Oneflow-Inc/oneflow
          ref: ${{ env.ONEFLOW_REF }}
          path: ${{ env.ONEFLOW_SRC }}
      - uses: ./cache-complete
        name: Save cache if successful
        id: save-cache
        with:
          oneflow-src: ${{ env.ONEFLOW_SRC }}
          entry: ${{ matrix.entry }}
          digest-type: build
          mark-as-completed: ${{ contains(matrix.runs-on, 'self-hosted') }}
      # Guard: the cache state seen by this job must agree with the state
      # recorded in the matrix by find-build-cache.
      - name: Check digest and fail if cache result not identical to matrix
        if: ${{ fromJSON(steps.save-cache.outputs.cache-hit) != matrix.cache-hit }}
        run: |
          echo "::error file=test.yml,line=204,col=10::steps.save-cache.outputs.cache-hit != matrix.cache-hit"
          exit 1
      - uses: ./
        name: Build manylinux cpu only
        if: ${{ matrix.entry == 'cpu' && !matrix.cache-hit }}
        with:
          cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/ci/cpu.cmake
          build-script: ${{ env.ONEFLOW_SRC }}/ci/manylinux/build.sh
          run-lit: true
          oneflow-src: ${{ env.ONEFLOW_SRC }}
          oneflow-build-env: manylinux
          wheelhouse-dir: ${{ env.WHEELHOUSE_DIR }}
          clear-wheelhouse-dir: true
          self-hosted: ${{ contains(matrix.runs-on, 'self-hosted') }}
          cuda-version: none
          manylinux-cache-dir: ${{ env.MANYLINUX_CACHE_DIR }}
          docker-run-use-system-http-proxy: false
          docker-run-use-lld: true
          retry-failed-build: true
          clean-ccache: true
          parallel: ${{ env.ONEFLOW_CI_BUILD_PARALLEL }}
          python-versions: |
            3.6
            3.7
      - uses: ./
        name: Build manylinux cu102
        if: ${{ matrix.entry == 'cu102' && !matrix.cache-hit }}
        with:
          cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/ci/cuda.cmake
          build-script: ${{ env.ONEFLOW_SRC }}/ci/manylinux/build-gcc7.sh
          oneflow-src: ${{ env.ONEFLOW_SRC }}
          oneflow-build-env: manylinux
          wheelhouse-dir: ${{ env.WHEELHOUSE_DIR }}
          clear-wheelhouse-dir: true
          self-hosted: ${{ contains(matrix.runs-on, 'self-hosted') }}
          cuda-version: "10.2"
          manylinux-cache-dir: ${{ env.MANYLINUX_CACHE_DIR }}
          docker-run-use-system-http-proxy: false
          docker-run-use-lld: false
          retry-failed-build: true
          parallel: ${{ env.ONEFLOW_CI_BUILD_PARALLEL }}
          python-versions: |
            3.6
            3.7
      - uses: ./
        name: Build manylinux cu102_xla
        if: ${{ matrix.entry == 'cu102_xla' && !matrix.cache-hit }}
        with:
          cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/ci/cuda-xla.cmake
          build-script: ${{ env.ONEFLOW_SRC }}/ci/manylinux/build-gcc7.sh
          oneflow-src: ${{ env.ONEFLOW_SRC }}
          oneflow-build-env: manylinux
          wheelhouse-dir: ${{ env.WHEELHOUSE_DIR }}
          clear-wheelhouse-dir: true
          self-hosted: ${{ contains(matrix.runs-on, 'self-hosted') }}
          cuda-version: "10.2"
          manylinux-cache-dir: ${{ env.MANYLINUX_CACHE_DIR }}
          docker-run-use-system-http-proxy: true
          docker-run-use-lld: true
          retry-failed-build: true
          parallel: ${{ env.ONEFLOW_CI_BUILD_PARALLEL }}
          python-versions: |
            3.6
          wheel-audit: false
      # The scp steps below read the key from /github/workflow/id_rsa, so a
      # copy of the runner's SSH key is placed under $RUNNER_TEMP first.
      - name: Copy key
        if: ${{ !fromJSON(matrix.cache-hit) && matrix.entry != 'cu102_xla' }}
        run: cp ~/.ssh/id_rsa "$RUNNER_TEMP/_github_workflow"
      # NOTE(review): appleboy/scp-action tracks the mutable `master` branch
      # while handling an SSH private key; consider pinning it to a release
      # tag or commit SHA.
      - name: Upload whl
        if: ${{ !fromJSON(matrix.cache-hit) && matrix.entry != 'cu102_xla' }}
        uses: appleboy/scp-action@master
        with:
          host: ${{ env.SSH_TANK_HOST }}
          username: ${{ env.SSH_USERNAME }}
          source: "${{ env.WHEELHOUSE_DIR }}/*.whl"
          key_path: /github/workflow/id_rsa
          target: "${{ env.SSH_TANK_PATH }}/digest/${{ steps.save-cache.outputs.build-digest }}/${{ matrix.entry }}/whl"
          rm: true
          strip_components: 1
      - name: Copy CPack dir
        if: ${{ !fromJSON(matrix.cache-hit) && matrix.entry != 'cu102_xla' }}
        # Left unquoted on purpose: MANYLINUX_CACHE_DIR starts with `~`, which
        # the shell only expands when it is not quoted.
        run: cp -r ${{ env.MANYLINUX_CACHE_DIR }}/build/cpack cpack
      - name: Upload packed liboneflow
        if: ${{ !fromJSON(matrix.cache-hit) && matrix.entry != 'cu102_xla' }}
        uses: appleboy/scp-action@master
        with:
          host: ${{ env.SSH_TANK_HOST }}
          username: ${{ env.SSH_USERNAME }}
          key_path: /github/workflow/id_rsa
          source: "cpack/*.zip"
          target: "${{ env.SSH_TANK_PATH }}/digest/${{ steps.save-cache.outputs.build-digest }}/${{ matrix.entry }}/cpack"
          rm: true
          strip_components: 1

  # After the manylinux builds, re-queries ./cache-complete for every entry
  # (without marking anything completed) and fails if it reports no cache hit.
  check-build-cache:
    name: "Check manylinux build cache"
    runs-on: ubuntu-latest
    needs: [test-build-manylinux]
    strategy:
      fail-fast: false
      max-parallel: 5
      matrix:
        entry: ["cu102_xla", "cu102", "cpu"]
    env:
      ONEFLOW_SRC: oneflow-src
    steps:
      - name: Checkout this repo
        uses: actions/checkout@v2
      - name: Checkout Oneflow-Inc/oneflow
        uses: actions/checkout@v2
        with:
          repository: Oneflow-Inc/oneflow
          ref: ${{ env.ONEFLOW_REF }}
          path: ${{ env.ONEFLOW_SRC }}
      - uses: ./cache-complete
        name: find cache
        id: find-cache
        with:
          runner-labels: |
            self-hosted
            linux
            provision
          oneflow-src: ${{ env.ONEFLOW_SRC }}
          entry: ${{ matrix.entry }}
          mark-as-completed: false
          digest-type: build
      - name: Check built
        if: ${{ !fromJSON(steps.find-cache.outputs.cache-hit) }}
        run: |
          exit 1

  # Runs ./cache-complete/matrix/test and publishes its `matrix` output,
  # which test-oneflow-doctor expands with fromJSON.
  find-test-cache:
    name: "Find test cache"
    runs-on: ubuntu-latest
    needs: [test-build-manylinux]
    env:
      ONEFLOW_SRC: oneflow-src
    outputs:
      matrix: ${{ steps.find-cache.outputs.matrix }}
    steps:
      - uses: actions/checkout@v2
      - name: Checkout Oneflow-Inc/oneflow
        uses: actions/checkout@v2
        with:
          repository: Oneflow-Inc/oneflow
          ref: ${{ env.ONEFLOW_REF }}
          path: ${{ env.ONEFLOW_SRC }}
      - uses: ./cache-complete/matrix/test
        name: find cache
        id: find-cache
        with:
          runner-labels: |
            self-hosted
            linux
            provision
          oneflow-src: ${{ env.ONEFLOW_SRC }}
          devices: |
            cuda
            cpu
          tests: |
            module
            misc
            speed-test

  # For each test matrix entry: downloads the built wheel/binaries by digest,
  # starts a detached test container, installs OneFlow into it and runs
  # `python3 -m oneflow --doctor` (plus oneflow_testexe for `misc` entries).
  test-oneflow-doctor:
    name: Test suite
    runs-on: ${{ matrix.runs-on }}
    needs: [find-test-cache]
    strategy:
      fail-fast: false
      max-parallel: 5
      matrix: ${{ fromJSON(needs.find-test-cache.outputs.matrix) }}
    env:
      ONEFLOW_SRC: oneflow-src
      TEST_CONTAINER_NAME: "pr-${{ github.event.pull_request.number }}-run-id-${{ github.run_id }}-${{ matrix.entry }}-test"
      TEST_WITH_TF_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/test-with-tf-2.4.0:b8b5eb2bdee6928fefd61ccabf8fb2d680835aeb
      TEST_WITH_TORCH_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/test-with-pytorch-1.9.0:e7a497b41d8b7f1bce055b1f23d027f93b1557ae
      PIP_INDEX_URL: "http://192.168.1.22:8000"
      PIP_INDEX_TRUST_ARGS: "--trusted-host 192.168.1.22"
      SSH_TANK_HOST: 192.168.1.23
      SSH_TANK_PATH: /home/ci-user/tank
    steps:
      - name: Fix permissions
        if: ${{ contains(matrix.runs-on, 'self-hosted') }}
        run: |
          set -x
          docker run --rm -v "$PWD":/p -w /p busybox chown -R "$(id -u):$(id -g)" .
      - uses: actions/checkout@v2
      - name: Checkout Oneflow-Inc/oneflow
        uses: actions/checkout@v2
        with:
          repository: Oneflow-Inc/oneflow
          ref: ${{ env.ONEFLOW_REF }}
          path: ${{ env.ONEFLOW_SRC }}
      # Clear any container with the same name before starting; `|| true`
      # keeps the step green when none exists.
      - name: Remove container
        timeout-minutes: 45
        if: ${{ contains(matrix.runs-on, 'self-hosted') }}
        run: |
          docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true
      - uses: ./cache-complete
        name: Save cache if successful
        id: save-cache
        with:
          oneflow-src: ${{ env.ONEFLOW_SRC }}
          entry: ${{ matrix.entry }}
          digest-type: ${{ matrix.digest-type }}
          mark-as-completed: ${{ contains(matrix.runs-on, 'self-hosted') }}
      - name: Check digest and fail if cache result not identical to matrix
        if: ${{ fromJSON(steps.save-cache.outputs.cache-hit) != matrix.cache-hit }}
        run: |
          echo "::error file=test.yml,line=204,col=10::steps.save-cache.outputs.cache-hit != matrix.cache-hit"
          exit 1
      - name: Download wheel and binary
        if: ${{ !fromJSON(matrix.cache-hit) && matrix.test-type != 'do-nothing' }}
        uses: ./digest/download
        id: download-digest
        with:
          digest: ${{ steps.save-cache.outputs.build-digest }}
          entry: ${{ matrix.compute-platform }}
          ssh-tank-host: ${{ env.SSH_TANK_HOST }}
          ssh-tank-path: ${{ env.SSH_TANK_PATH }}
      # Exactly one of the next two steps runs and exports TEST_IMG_TAG for
      # the later steps via $GITHUB_ENV.
      - name: Set tf container
        if: ${{ fromJSON(matrix.is-single-client) }}
        run: |
          echo "TEST_IMG_TAG=${TEST_WITH_TF_IMG_TAG}" >> "$GITHUB_ENV"
      - name: Set pytorch container
        if: ${{ !fromJSON(matrix.is-single-client) }}
        run: |
          echo "TEST_IMG_TAG=${TEST_WITH_TORCH_IMG_TAG}" >> "$GITHUB_ENV"
      - name: Unzip packed liboneflow
        working-directory: ${{ env.ONEFLOW_SRC }}
        env:
          ONEFLOW_CPACK_PATH: ${{ steps.download-digest.outputs.entry-dir }}/cpack
        if: ${{ !fromJSON(matrix.cache-hit) && matrix.test-type == 'misc' }}
        run: |
          unzip ${{ env.ONEFLOW_CPACK_PATH }}/liboneflow-ci-linux.zip
      # The container idles on `sleep 3600`; later steps `docker exec` into it.
      - name: Start container
        if: ${{ !fromJSON(matrix.cache-hit) && matrix.test-type != 'do-nothing' }}
        working-directory: ${{ env.ONEFLOW_SRC }}
        env:
          ONEFLOW_WHEEL_PATH: ${{ steps.download-digest.outputs.entry-dir }}/whl
          ONEFLOW_BIN_PATH: ${{ steps.download-digest.outputs.entry-dir }}/bin
        run: |
          docker run -d --privileged --network host --shm-size=8g \
            --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
            --runtime=nvidia \
            -v /dataset:/dataset:ro -v /model_zoo:/model_zoo:ro \
            -v "${ONEFLOW_WHEEL_PATH}:${ONEFLOW_WHEEL_PATH}:ro" \
            -v "${ONEFLOW_BIN_PATH}:${ONEFLOW_BIN_PATH}:ro" \
            -v "$HOME/test-container-cache/dot-local:/root/.local" \
            -v "$HOME/test-container-cache/dot-cache:/root/.cache" \
            -e ONEFLOW_WHEEL_PATH="${ONEFLOW_WHEEL_PATH}" \
            -e ONEFLOW_BIN_PATH="${ONEFLOW_BIN_PATH}" \
            -e ONEFLOW_CI=1 \
            -e TERM="xterm-256color" \
            -v "$PWD:$PWD" \
            -w "$PWD" \
            --name ${{ env.TEST_CONTAINER_NAME }} \
            "${TEST_IMG_TAG}" \
            sleep 3600
      - name: Test container
        if: ${{ !fromJSON(matrix.cache-hit) && matrix.test-type != 'do-nothing' }}
        run: |
          docker exec ${{ env.TEST_CONTAINER_NAME }} ls
      - name: Run oneflow_testexe
        if: ${{ !fromJSON(matrix.cache-hit) && matrix.test-type == 'misc' }}
        run: |
          docker exec ${{ env.TEST_CONTAINER_NAME }} ./liboneflow-ci-linux/bin/oneflow_testexe
      - name: Install OneFlow
        if: ${{ !fromJSON(matrix.cache-hit) && matrix.test-type != 'do-nothing' }}
        env:
          ONEFLOW_WHEEL_PATH: ${{ steps.download-digest.outputs.entry-dir }}/whl
        run: |
          ls ${{ env.ONEFLOW_WHEEL_PATH }}
          docker exec ${{ env.TEST_CONTAINER_NAME }} python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
          docker exec ${{ env.TEST_CONTAINER_NAME }} python3 -m pip install --find-links=${{ env.ONEFLOW_WHEEL_PATH }} oneflow
      - name: Run OneFlow doctor
        if: ${{ !fromJSON(matrix.cache-hit) && matrix.test-type != 'do-nothing' }}
        run: |
          docker exec ${{ env.TEST_CONTAINER_NAME }} python3 -m oneflow --doctor
      # always(): clean up the container even when an earlier step failed.
      - name: Remove container
        timeout-minutes: 45
        if: ${{ always() && contains(matrix.runs-on, 'self-hosted') }}
        run: |
          docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true