# Fix cu12 size #893
#
# Workflow file for this run

# Workflow-level settings: display name, triggers, run concurrency and the
# environment variables shared by all jobs.
name: build-test

# Rebuild for any pull request, and for pushes to main or a release branch.
on:
  pull_request:
  push:
    branches:
      - main
      - 'releases/*'

# Runs are grouped per git ref; starting a new run cancels the one in progress.
concurrency:
  group: test-${{ github.ref }}
  cancel-in-progress: true

env:
  # Object-storage credentials, taken from the repository secrets.
  OSS_ACCESS_KEY_ID: ${{ secrets.OSS_ACCESS_KEY_ID }}
  OSS_ACCESS_KEY_SECRET: ${{ secrets.OSS_ACCESS_KEY_SECRET }}
  # Ref of Oneflow-Inc/oneflow used by the checkout steps.
  ONEFLOW_REF: master
  # Forwarded to the build action as its `parallel` input.
  ONEFLOW_CI_BUILD_PARALLEL: 32

jobs:
# Make sure build/ci work properly: install the npm dependencies of this
# repository and run its `all` script on a GitHub-hosted runner.
build:
  name: "Build and test this repo"
  runs-on: ubuntu-latest
  env:
    # Workspace-relative directory that receives the OneFlow sources.
    ONEFLOW_SRC: oneflow-src
  steps:
    - uses: actions/checkout@v2
    - name: Checkout Oneflow-Inc/oneflow
      uses: actions/checkout@v2
      with:
        repository: Oneflow-Inc/oneflow
        # Use the workflow-wide ONEFLOW_REF like the other OneFlow checkouts,
        # so this job does not silently track a different revision when the
        # ref is changed.
        ref: ${{ env.ONEFLOW_REF }}
        path: ${{ env.ONEFLOW_SRC }}
    - run: |
        npm install
    - run: |
        npm run all
# Runs this repository's root action (`./`) to build OneFlow inside a conda
# environment, once on a GitHub-hosted runner and once on a self-hosted one.
test-build-oneflow:
  name: "Build OneFlow Conda"
  runs-on: ${{ matrix.os }}
  strategy:
    fail-fast: false
    max-parallel: 5
    matrix:
      entry: [gh-hosted, self-hosted]
      include:
        # `os` is a list of runner labels; it is used directly as `runs-on`.
        - entry: gh-hosted
          os: [ubuntu-latest]
        - entry: self-hosted
          os: ["self-hosted", "linux", "provision"]
  env:
    ONEFLOW_SRC: oneflow-src
  steps:
    - name: Fix permissions
      if: ${{ contains(matrix.os, 'self-hosted') }}
      # chown the workspace to the runner user from inside a container.
      # NOTE(review): presumably needed because earlier containerized builds
      # leave files the runner user cannot modify - confirm.
      run: |
        set -x
        docker run --rm -v "$PWD":/p -w /p busybox chown -R "$(id -u):$(id -g)" .
    - name: Remove leftover cuda-installer.log
      if: ${{ contains(matrix.os, 'self-hosted') }}
      run: |
        docker run --rm -v /tmp:/host/tmp -w /p busybox rm -f /host/tmp/cuda-installer.log
    - uses: actions/checkout@v2
    - name: Checkout Oneflow-Inc/oneflow
      uses: actions/checkout@v2
      with:
        repository: Oneflow-Inc/oneflow
        ref: ${{ env.ONEFLOW_REF }}
        path: ${{ env.ONEFLOW_SRC }}
    - uses: ./cache-complete
      name: Save cache if successful
      id: save-cache
      timeout-minutes: 5
      with:
        oneflow-src: ${{ env.ONEFLOW_SRC }}
        entry: build-with-clang
        digest-type: build
        # Only pull requests coming from this repository (not forks) may mark
        # the cache entry as completed.
        mark-as-completed: ${{ github.event.pull_request.head.repo.full_name == github.repository }}
    # The guards below read the cache status from the save-cache step. This
    # job's matrix only defines `entry` and `os`, so `matrix.cache-hit` is
    # always empty here and `!matrix.cache-hit` was unconditionally true.
    - name: Checkout Oneflow-Inc/conda-env
      if: ${{ !fromJSON(steps.save-cache.outputs.cache-hit) }}
      uses: actions/checkout@v2
      with:
        repository: Oneflow-Inc/conda-env
        ref: 30a7f00eb48ee9009d85a848e720823e5054c66b
        path: conda-env
    - uses: ./
      name: Test conda on gh-hosted
      if: ${{ contains(matrix.os, 'ubuntu-latest') && !fromJSON(steps.save-cache.outputs.cache-hit) }}
      with:
        cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/ci/gh-hosted/cpu-clang.cmake
        oneflow-src: ${{ env.ONEFLOW_SRC }}
        oneflow-build-env: conda
        conda-env-file: conda-env/dev/clang10/environment-v2.yml
        conda-env-name: oneflow-dev-clang10-v2
    - uses: ./
      name: Test conda on self-hosted
      if: ${{ contains(matrix.os, 'self-hosted') && !fromJSON(steps.save-cache.outputs.cache-hit) }}
      with:
        cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/cn/fast/cpu-clang.cmake
        oneflow-src: ${{ env.ONEFLOW_SRC }}
        oneflow-build-env: conda
        conda-env-file: conda-env/dev/clang10/environment-v2.yml
        conda-installer-url: https://oneflow-static.oss-cn-beijing.aliyuncs.com/downloads/conda-installers/Miniconda3-py39_4.10.3-Linux-x86_64.sh
        conda-prefix: ~/miniconda3-prefixes/py39_4.10.3
        self-hosted: ${{ contains(matrix.os, 'self-hosted') }}
        conda-env-name: oneflow-dev-clang10-v2
        parallel: ${{ env.ONEFLOW_CI_BUILD_PARALLEL }}
# Runs the `cache-complete/matrix/build` action and exposes its `matrix`
# output, which test-build-manylinux consumes as its job matrix.
find-build-cache:
  name: "Find build cache"
  runs-on: ubuntu-latest
  env:
    ONEFLOW_SRC: oneflow-src
  outputs:
    matrix: ${{ steps.find-cache.outputs.matrix }}
  steps:
    - uses: actions/checkout@v2
    - name: Checkout Oneflow-Inc/oneflow
      uses: actions/checkout@v2
      with:
        repository: Oneflow-Inc/oneflow
        ref: ${{ env.ONEFLOW_REF }}
        # Same variable that is handed to the action below, so the checkout
        # location and the `oneflow-src` input cannot drift apart.
        path: ${{ env.ONEFLOW_SRC }}
    - uses: ./cache-complete/matrix/build
      name: find cache
      id: find-cache
      with:
        # True when the pull request carries the `need-clean-ccache` label.
        delete-cache: ${{ contains(github.event.pull_request.labels.*.name, 'need-clean-ccache') }}
        runner-labels: |
          self-hosted
          linux
          provision
        oneflow-src: ${{ env.ONEFLOW_SRC }}
        entries: |
          cu102
          cpu
          cu102_xla
# Builds manylinux wheels for every entry of the matrix produced by
# find-build-cache. For all entries except cu102_xla the wheels and the packed
# liboneflow are then copied to the SSH "tank" host.
test-build-manylinux:
  name: "Build OneFlow manylinux"
  runs-on: ${{ matrix.runs-on }}
  needs: [find-build-cache]
  # One build per entry/digest pair; a newer one cancels the one in progress.
  concurrency:
    group: test-build-manylinux-${{ matrix.entry }}-${{ matrix.build-digest }}
    cancel-in-progress: true
  strategy:
    fail-fast: false
    max-parallel: 1
    matrix: ${{ fromJson(needs.find-build-cache.outputs.matrix) }}
  env:
    ONEFLOW_SRC: oneflow-src
    MANYLINUX_CACHE_DIR: ~/manylinux-cache-dir/${{ matrix.entry }}
    WHEELHOUSE_DIR: manylinux-wheelhouse
    SSH_TANK_HOST: 192.168.1.23
    SSH_TANK_PATH: /home/ci-user/tank
    SSH_USERNAME: ci-user
  steps:
    - name: Fix permissions
      if: ${{ contains(matrix.runs-on, 'self-hosted') }}
      # chown the workspace to the runner user from inside a container.
      run: |
        set -x
        docker run --rm -v "$PWD":/p -w /p busybox chown -R "$(id -u):$(id -g)" .
    - name: Remove leftover cuda-installer.log
      if: ${{ contains(matrix.runs-on, 'self-hosted') }}
      run: |
        docker run --rm -v /tmp:/host/tmp -w /p busybox rm -f /host/tmp/cuda-installer.log
    - name: Checkout this repo
      uses: actions/checkout@v2
    - name: Checkout Oneflow-Inc/oneflow
      uses: actions/checkout@v2
      with:
        repository: Oneflow-Inc/oneflow
        ref: ${{ env.ONEFLOW_REF }}
        path: ${{ env.ONEFLOW_SRC }}
    - uses: ./cache-complete
      name: Save cache if successful
      id: save-cache
      with:
        oneflow-src: ${{ env.ONEFLOW_SRC }}
        entry: ${{ matrix.entry }}
        digest-type: build
        mark-as-completed: ${{ contains(matrix.runs-on, 'self-hosted') }}
    # The matrix was computed in an earlier job; abort if the cache status seen
    # by this job no longer agrees with it.
    - name: Check digest and fail if cache result not identical to matrix
      if: ${{ fromJSON(steps.save-cache.outputs.cache-hit) != matrix.cache-hit }}
      run: |
        echo "::error file=test.yml,line=204,col=10::steps.save-cache.outputs.cache-hit != matrix.cache-hit"
        exit 1
    # The build guards use `fromJson(matrix.cache-hit)`, the same form as the
    # upload steps further down, so every step of this job interprets the
    # flag identically.
    - uses: ./
      name: Build manylinux cpu only
      if: ${{ matrix.entry == 'cpu' && !fromJson(matrix.cache-hit) }}
      with:
        cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/ci/cpu.cmake
        build-script: ${{ env.ONEFLOW_SRC }}/ci/manylinux/build.sh
        run-lit: true
        oneflow-src: ${{ env.ONEFLOW_SRC }}
        oneflow-build-env: manylinux
        wheelhouse-dir: ${{ env.WHEELHOUSE_DIR }}
        clear-wheelhouse-dir: true
        self-hosted: ${{ contains(matrix.runs-on, 'self-hosted') }}
        cuda-version: none
        manylinux-cache-dir: ${{ env.MANYLINUX_CACHE_DIR }}
        docker-run-use-system-http-proxy: false
        docker-run-use-lld: true
        retry-failed-build: true
        clean-ccache: true
        parallel: ${{ env.ONEFLOW_CI_BUILD_PARALLEL }}
        python-versions: |
          3.6
          3.7
    - uses: ./
      name: Build manylinux cu102
      if: ${{ matrix.entry == 'cu102' && !fromJson(matrix.cache-hit) }}
      with:
        cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/ci/cuda.cmake
        build-script: ${{ env.ONEFLOW_SRC }}/ci/manylinux/build-gcc7.sh
        oneflow-src: ${{ env.ONEFLOW_SRC }}
        oneflow-build-env: manylinux
        wheelhouse-dir: ${{ env.WHEELHOUSE_DIR }}
        clear-wheelhouse-dir: true
        self-hosted: ${{ contains(matrix.runs-on, 'self-hosted') }}
        cuda-version: "10.2"
        manylinux-cache-dir: ${{ env.MANYLINUX_CACHE_DIR }}
        docker-run-use-system-http-proxy: false
        docker-run-use-lld: false
        retry-failed-build: true
        parallel: ${{ env.ONEFLOW_CI_BUILD_PARALLEL }}
        python-versions: |
          3.6
          3.7
    - uses: ./
      name: Build manylinux cu102_xla
      if: ${{ matrix.entry == 'cu102_xla' && !fromJson(matrix.cache-hit) }}
      with:
        cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/ci/cuda-xla.cmake
        build-script: ${{ env.ONEFLOW_SRC }}/ci/manylinux/build-gcc7.sh
        oneflow-src: ${{ env.ONEFLOW_SRC }}
        oneflow-build-env: manylinux
        wheelhouse-dir: ${{ env.WHEELHOUSE_DIR }}
        clear-wheelhouse-dir: true
        self-hosted: ${{ contains(matrix.runs-on, 'self-hosted') }}
        cuda-version: "10.2"
        manylinux-cache-dir: ${{ env.MANYLINUX_CACHE_DIR }}
        docker-run-use-system-http-proxy: true
        docker-run-use-lld: true
        retry-failed-build: true
        parallel: ${{ env.ONEFLOW_CI_BUILD_PARALLEL }}
        python-versions: |
          3.6
        wheel-audit: false
    # NOTE(review): the key is copied under $RUNNER_TEMP/_github_workflow and
    # the scp steps read it from /github/workflow; presumably that is where
    # the container action sees the same directory - confirm.
    - name: Copy key
      if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'cu102_xla' }}
      run: cp ~/.ssh/id_rsa "$RUNNER_TEMP/_github_workflow"
    # NOTE(review): `@master` follows a moving branch; consider pinning the
    # action to a release tag or commit SHA.
    - name: Upload whl
      if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'cu102_xla' }}
      uses: appleboy/scp-action@master
      with:
        host: ${{ env.SSH_TANK_HOST }}
        username: ${{ env.SSH_USERNAME }}
        source: "${{ env.WHEELHOUSE_DIR }}/*.whl"
        key_path: /github/workflow/id_rsa
        target: "${{ env.SSH_TANK_PATH }}/digest/${{ steps.save-cache.outputs.build-digest }}/${{ matrix.entry }}/whl"
        rm: true
        strip_components: 1
    # The path is interpolated by the expression engine and deliberately left
    # unquoted: it starts with `~`, which the shell only expands when unquoted.
    - name: Copy CPack dir
      if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'cu102_xla' }}
      run: cp -r ${{ env.MANYLINUX_CACHE_DIR }}/build/cpack cpack
    - name: Upload packed liboneflow
      if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'cu102_xla' }}
      uses: appleboy/scp-action@master
      with:
        host: ${{ env.SSH_TANK_HOST }}
        username: ${{ env.SSH_USERNAME }}
        key_path: /github/workflow/id_rsa
        source: "cpack/*.zip"
        target: "${{ env.SSH_TANK_PATH }}/digest/${{ steps.save-cache.outputs.build-digest }}/${{ matrix.entry }}/cpack"
        rm: true
        strip_components: 1
# Runs after the manylinux builds: looks up each entry with `cache-complete`
# and fails the job for any entry whose lookup is not a cache hit.
check-build-cache:
  name: Check manylinux build cache
  runs-on: ubuntu-latest
  needs:
    - test-build-manylinux
  strategy:
    fail-fast: false
    max-parallel: 5
    matrix:
      entry:
        - cu102_xla
        - cu102
        - cpu
  env:
    ONEFLOW_SRC: oneflow-src
  steps:
    - name: Checkout this repo
      uses: actions/checkout@v2
    - name: Checkout Oneflow-Inc/oneflow
      uses: actions/checkout@v2
      with:
        repository: Oneflow-Inc/oneflow
        ref: ${{ env.ONEFLOW_REF }}
        path: ${{ env.ONEFLOW_SRC }}
    # Lookup only: this step never marks the entry as completed.
    - name: find cache
      id: find-cache
      uses: ./cache-complete
      with:
        oneflow-src: ${{ env.ONEFLOW_SRC }}
        entry: ${{ matrix.entry }}
        digest-type: build
        mark-as-completed: false
        runner-labels: |
          self-hosted
          linux
          provision
    # A cache miss at this point fails the job.
    - name: Check built
      if: ${{ !fromJSON(steps.find-cache.outputs.cache-hit) }}
      run: exit 1
# Runs the `cache-complete/matrix/test` action once the manylinux builds are
# done and exposes its `matrix` output, which test-oneflow-doctor consumes.
find-test-cache:
  name: "Find test cache"
  runs-on: ubuntu-latest
  needs: [test-build-manylinux]
  env:
    ONEFLOW_SRC: oneflow-src
  outputs:
    matrix: ${{ steps.find-cache.outputs.matrix }}
  steps:
    - uses: actions/checkout@v2
    - name: Checkout Oneflow-Inc/oneflow
      uses: actions/checkout@v2
      with:
        repository: Oneflow-Inc/oneflow
        ref: ${{ env.ONEFLOW_REF }}
        # Same variable that is handed to the action below, so the checkout
        # location and the `oneflow-src` input cannot drift apart.
        path: ${{ env.ONEFLOW_SRC }}
    - uses: ./cache-complete/matrix/test
      name: find cache
      id: find-cache
      with:
        runner-labels: |
          self-hosted
          linux
          provision
        oneflow-src: ${{ env.ONEFLOW_SRC }}
        devices: |
          cuda
          cpu
        tests: |
          module
          misc
          speed-test
# For every entry of the matrix produced by find-test-cache: download the
# built wheel, start a test container, install OneFlow in it and run
# `python3 -m oneflow --doctor` (plus `oneflow_testexe` for the `misc` tests).
test-oneflow-doctor:
  name: Test suite
  runs-on: ${{ matrix.runs-on }}
  needs: [find-test-cache]
  strategy:
    fail-fast: false
    max-parallel: 5
    matrix: ${{ fromJson(needs.find-test-cache.outputs.matrix) }}
  env:
    ONEFLOW_SRC: oneflow-src
    TEST_CONTAINER_NAME: "pr-${{ github.event.pull_request.number }}-run-id-${{ github.run_id }}-${{ matrix.entry }}-test"
    TEST_WITH_TF_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/test-with-tf-2.4.0:b8b5eb2bdee6928fefd61ccabf8fb2d680835aeb
    TEST_WITH_TORCH_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/test-with-pytorch-1.9.0:e7a497b41d8b7f1bce055b1f23d027f93b1557ae
    PIP_INDEX_URL: "http://192.168.1.22:8000"
    PIP_INDEX_TRUST_ARGS: "--trusted-host 192.168.1.22"
    SSH_TANK_HOST: 192.168.1.23
    SSH_TANK_PATH: /home/ci-user/tank
  steps:
    - name: Fix permissions
      if: ${{ contains(matrix.runs-on, 'self-hosted') }}
      # chown the workspace to the runner user from inside a container.
      run: |
        set -x
        docker run --rm -v "$PWD":/p -w /p busybox chown -R "$(id -u):$(id -g)" .
    - uses: actions/checkout@v2
    - name: Checkout Oneflow-Inc/oneflow
      uses: actions/checkout@v2
      with:
        repository: Oneflow-Inc/oneflow
        ref: ${{ env.ONEFLOW_REF }}
        # Same variable the later steps use as working directory and as the
        # `oneflow-src` input, so the locations cannot drift apart.
        path: ${{ env.ONEFLOW_SRC }}
    # Best-effort removal of a container left over under the same name.
    - name: Remove container
      timeout-minutes: 45
      if: ${{ contains(matrix.runs-on, 'self-hosted') }}
      run: |
        docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true
    - uses: ./cache-complete
      name: Save cache if successful
      id: save-cache
      with:
        oneflow-src: ${{ env.ONEFLOW_SRC }}
        entry: ${{ matrix.entry }}
        digest-type: ${{ matrix.digest-type }}
        mark-as-completed: ${{ contains(matrix.runs-on, 'self-hosted') }}
    # The matrix was computed in an earlier job; abort if the cache status seen
    # by this job no longer agrees with it.
    - name: Check digest and fail if cache result not identical to matrix
      if: ${{ fromJSON(steps.save-cache.outputs.cache-hit) != matrix.cache-hit }}
      run: |
        echo "::error file=test.yml,line=204,col=10::steps.save-cache.outputs.cache-hit != matrix.cache-hit"
        exit 1
    - name: Download wheel and binary
      if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type != 'do-nothing' }}
      uses: ./digest/download
      id: download-digest
      with:
        digest: ${{ steps.save-cache.outputs.build-digest }}
        entry: ${{ matrix.compute-platform }}
        ssh-tank-host: ${{ env.SSH_TANK_HOST }}
        ssh-tank-path: ${{ env.SSH_TANK_PATH }}
    # Exactly one of the next two steps runs and exports TEST_IMG_TAG for the
    # steps that follow.
    - name: Set tf container
      if: ${{ fromJSON(matrix.is-single-client) }}
      run: |
        echo "TEST_IMG_TAG=${TEST_WITH_TF_IMG_TAG}" >> "$GITHUB_ENV"
    - name: Set pytorch container
      if: ${{ !fromJSON(matrix.is-single-client) }}
      run: |
        echo "TEST_IMG_TAG=${TEST_WITH_TORCH_IMG_TAG}" >> "$GITHUB_ENV"
    - name: Unzip packed liboneflow
      working-directory: ${{ env.ONEFLOW_SRC }}
      env:
        ONEFLOW_CPACK_PATH: ${{ steps.download-digest.outputs.entry-dir }}/cpack
      if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'misc' }}
      run: |
        unzip ${{ env.ONEFLOW_CPACK_PATH }}/liboneflow-ci-linux.zip
    # Start a detached container that idles for an hour (`sleep 3600`); the
    # following steps run their commands in it with `docker exec`.
    - name: Start container
      if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type != 'do-nothing' }}
      working-directory: ${{ env.ONEFLOW_SRC }}
      env:
        ONEFLOW_WHEEL_PATH: ${{ steps.download-digest.outputs.entry-dir }}/whl
        ONEFLOW_BIN_PATH: ${{ steps.download-digest.outputs.entry-dir }}/bin
      run: |
        docker run -d --privileged --network host --shm-size=8g \
          --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
          --runtime=nvidia \
          -v /dataset:/dataset:ro -v /model_zoo:/model_zoo:ro \
          -v "${ONEFLOW_WHEEL_PATH}:${ONEFLOW_WHEEL_PATH}:ro" \
          -v "${ONEFLOW_BIN_PATH}:${ONEFLOW_BIN_PATH}:ro" \
          -v "$HOME/test-container-cache/dot-local:/root/.local" \
          -v "$HOME/test-container-cache/dot-cache:/root/.cache" \
          -e ONEFLOW_WHEEL_PATH="${ONEFLOW_WHEEL_PATH}" \
          -e ONEFLOW_BIN_PATH="${ONEFLOW_BIN_PATH}" \
          -e ONEFLOW_CI=1 \
          -e TERM="xterm-256color" \
          -v "$PWD:$PWD" \
          -w "$PWD" \
          --name ${{ env.TEST_CONTAINER_NAME }} \
          "${TEST_IMG_TAG}" \
          sleep 3600
    # Smoke check that commands can be executed inside the container.
    - name: Test container
      if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type != 'do-nothing' }}
      run: |
        docker exec ${{ env.TEST_CONTAINER_NAME }} ls
    - name: Run oneflow_testexe
      if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'misc' }}
      run: |
        docker exec ${{ env.TEST_CONTAINER_NAME }} ./liboneflow-ci-linux/bin/oneflow_testexe
    - name: Install OneFlow
      if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type != 'do-nothing' }}
      env:
        ONEFLOW_WHEEL_PATH: ${{ steps.download-digest.outputs.entry-dir }}/whl
      run: |
        ls ${{ env.ONEFLOW_WHEEL_PATH }}
        docker exec ${{ env.TEST_CONTAINER_NAME }} python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
        docker exec ${{ env.TEST_CONTAINER_NAME }} python3 -m pip install --find-links=${{ env.ONEFLOW_WHEEL_PATH }} oneflow
    - name: Run OneFlow doctor
      if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type != 'do-nothing' }}
      run: |
        docker exec ${{ env.TEST_CONTAINER_NAME }} python3 -m oneflow --doctor
    # `always()` makes the cleanup run even when an earlier step failed.
    - name: Remove container
      timeout-minutes: 45
      if: ${{ always() && contains(matrix.runs-on, 'self-hosted') }}
      run: |
        docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true