# Web-page residue kept as a comment so that it does not break YAML parsing:
# title "Fix cu12 size", reference "Fix cu12 size #281".

# Benchmark workflow.
#
# Triggered by pull-request activity. The first two jobs only proceed for
# non-draft pull requests whose base branch is `main`.
#
# Job chain:
#   find-benchmark-cache -> benchmark -> update-history
name: Benchmark
on:
  pull_request:
    types: [opened, review_requested, ready_for_review, synchronize, unlocked]

# One concurrency group per ref; a newer run cancels the run in progress.
concurrency:
  group: benchmark-${{ github.ref }}
  cancel-in-progress: true

# Workflow-level environment, inherited by every job unless overridden there.
env:
  OSS_ACCESS_KEY_ID: ${{ secrets.OSS_ACCESS_KEY_ID }}
  OSS_ACCESS_KEY_SECRET: ${{ secrets.OSS_ACCESS_KEY_SECRET }}
  # Forwarded into the test container by the "Start container" step.
  ONEFLOW_TIMEOUT_SECONDS: "90"
  # Checkout path of Oneflow-Inc/vision inside the workspace.
  FLOW_VISION_SRC: flow_vision
  # Image the test container is started from.
  TEST_WITH_TORCH_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/test-with-pytorch-1.9.0-cuda10.2-cudnn7-runtime:70729b0680b5a32daba6f50b56e0c169cd1636fa
  # Checkout path and ref used for Oneflow-Inc/oneflow.
  ONEFLOW_SRC: oneflow-src
  ONEFLOW_REF: master

jobs:
  # Produces the job matrix consumed by the `benchmark` job.
  find-benchmark-cache:
    name: "Find benchmark cache"
    if: github.event.pull_request.draft == false && github.base_ref == 'main'
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.find-cache.outputs.matrix }}
    steps:
      # This repository goes to the workspace root so the local action
      # ./cache-complete/matrix/test can be resolved below.
      - uses: actions/checkout@v2
      - name: Checkout Oneflow-Inc/oneflow
        uses: actions/checkout@v2
        with:
          repository: Oneflow-Inc/oneflow
          ref: ${{ env.ONEFLOW_REF }}
          path: ${{ env.ONEFLOW_SRC }}
      - uses: ./cache-complete/matrix/test
        name: find cache
        id: find-cache
        timeout-minutes: 5
        with:
          # Multi-line inputs: one value per line.
          runner-labels: |
            self-hosted
            linux
            provision
          oneflow-src: ${{ env.ONEFLOW_SRC }}
          devices: |
            cuda
          tests: |
            benchmark

  # Runs the benchmark suite inside a Docker container, one matrix entry at a
  # time, on the runner named by each matrix entry.
  benchmark:
    name: Benchmark suite
    runs-on: ${{ matrix.runs-on }}
    if: github.event.pull_request.draft == false && github.base_ref == 'main'
    needs: [find-benchmark-cache]
    strategy:
      fail-fast: true
      max-parallel: 1
      matrix: ${{ fromJson(needs.find-benchmark-cache.outputs.matrix) }}
    env:
      # Overrides the workflow-level value: in this job Oneflow-Inc/oneflow is
      # checked out into the workspace root (its checkout step sets no `path`).
      ONEFLOW_SRC: "."
      TEST_CONTAINER_NAME: "ci-benchmark"
      SSH_TANK_HOST: 192.168.1.13
      SSH_TANK_PATH: /tank
    steps:
      # Empties the workspace through a container before checking out.
      # Presumably needed because earlier containers left files the runner
      # user cannot delete -- TODO confirm.
      # `*` does not match dotfiles, hence the explicit .pytest_cache removal.
      - name: Fix permissions
        if: ${{ contains(matrix.runs-on, 'self-hosted') }}
        run: |
          set -x
          docker run --rm -v $PWD:$PWD -w $PWD busybox rm -rf *
          docker run --rm -v $PWD:$PWD -w $PWD busybox rm -rf .pytest_cache
      - name: Checkout Oneflow-Inc/oneflow
        uses: actions/checkout@v2
        with:
          ref: ${{ env.ONEFLOW_REF }}
          repository: Oneflow-Inc/oneflow
      - name: Checkout Oneflow-Inc/vision
        if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
        uses: actions/checkout@v2
        with:
          repository: Oneflow-Inc/vision
          # please use a commit here
          ref: ca8ebc663b58667cf8cd1b6ef0c861522780b7bb
          path: ${{ env.FLOW_VISION_SRC}}
      # This repository, placed under ./get-oneflow so that the local action
      # ./get-oneflow/pytest-benchmark can be resolved later.
      - uses: actions/checkout@v2
        with:
          path: get-oneflow
      # Remove any container with the same name that is still around;
      # `|| true` keeps the step green when there is none.
      - name: Remove container
        timeout-minutes: 45
        if: ${{ contains(matrix.runs-on, 'self-hosted') }}
        run: |
          docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true
      # Values appended to $GITHUB_ENV are visible to the following steps.
      - name: Set environment variables
        if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
        run: |
          set -x
          echo "ONEFLOW_TEST_CACHE_DIR=$HOME/ci-cache/test_cache" >> $GITHUB_ENV
      - name: Set environment variables (distributed)
        if: ${{ fromJson(matrix.is-distributed) }}
        run: |
          set -x
          EXTRA_DOCKER_ARGS+=" --network host "
          echo "EXTRA_DOCKER_ARGS=${EXTRA_DOCKER_ARGS}" >> $GITHUB_ENV
      # Opt-in through the `need-test-verbose` pull-request label.
      - name: Enable ONEFLOW_TEST_VERBOSE
        if: ${{ contains(github.event.pull_request.labels.*.name, 'need-test-verbose') }}
        run: |
          EXTRA_DOCKER_ARGS+=" --env ONEFLOW_TEST_VERBOSE=1"
          echo "EXTRA_DOCKER_ARGS=${EXTRA_DOCKER_ARGS}" >> $GITHUB_ENV
      # Starts the detached test container; it stays alive for `sleep 5400`
      # (90 minutes) and is removed on exit because of `--rm`.
      # NOTE(review): ONEFLOW_WHEEL_PATH is not set anywhere in this workflow;
      # presumably it comes from the runner's environment -- confirm.
      # NOTE(review): the "Benchmark Test" step allows 100 minutes, longer than
      # the 90-minute keep-alive here -- confirm this is intended.
      - name: Start container
        if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
        working-directory: ${{ env.ONEFLOW_SRC }}
        run: |
          docker run -d --rm --privileged --shm-size=8g \
            --pids-limit -1 \
            --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
            --runtime=nvidia \
            -v /dataset:/dataset:ro -v /model_zoo:/model_zoo:ro \
            -v ${ONEFLOW_WHEEL_PATH}:${ONEFLOW_WHEEL_PATH}:ro \
            -v $HOME/test-container-cache/dot-local:/root/.local \
            -v $HOME/test-container-cache/dot-cache:/root/.cache \
            -e ONEFLOW_WHEEL_PATH=${ONEFLOW_WHEEL_PATH} \
            -e ONEFLOW_CI=1 \
            -v $PWD:$PWD \
            -w $PWD \
            -v ${ONEFLOW_TEST_CACHE_DIR}:${ONEFLOW_TEST_CACHE_DIR} \
            -e CUDA_VISIBLE_DEVICES=1 \
            -e ONEFLOW_TEST_CACHE_DIR=${ONEFLOW_TEST_CACHE_DIR} \
            -e ONEFLOW_TIMEOUT_SECONDS=${{ env.ONEFLOW_TIMEOUT_SECONDS }} \
            -e ONEFLOW_MLIR_ENABLE_ROUND_TRIP=1 \
            --name ${TEST_CONTAINER_NAME} \
            ${{ env.EXTRA_DOCKER_ARGS }} \
            ${{ env.TEST_WITH_TORCH_IMG_TAG }} \
            sleep 5400
      # Smoke check: fails if the container is not running.
      - name: Test container
        if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
        run: |
          docker exec ${{ env.TEST_CONTAINER_NAME }} ls
      # Installs a pre-release oneflow wheel from the staging index given to
      # `-f`, using the pip index mirror configured on the line before it.
      - name: Install OneFlow
        if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
        run: |
          ls ${ONEFLOW_WHEEL_PATH}
          docker exec ${TEST_CONTAINER_NAME} python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
          docker exec ${TEST_CONTAINER_NAME} python3 -m pip install --pre oneflow -f https://staging.oneflow.info/branch/master/cu102
      # Editable install of the Oneflow-Inc/vision checkout.
      - name: Install Flow Vision
        if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
        run: |
          docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.FLOW_VISION_SRC}}
      - name: Benchmark Test
        timeout-minutes: 100
        if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'benchmark' && matrix.device == 'cuda' }}
        uses: ./get-oneflow/pytest-benchmark
        with:
          collect-path: ${{ env.FLOW_VISION_SRC }}/benchmark
          container-name: ${{ env.TEST_CONTAINER_NAME }}
          unknown-threshold: 10
          error-threshold: 40
      # `always()` makes the cleanup run even when an earlier step failed.
      - name: Remove container
        timeout-minutes: 45
        if: ${{ always() && contains(matrix.runs-on, 'self-hosted') }}
        run: |
          docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true
          docker run --rm -v $PWD:$PWD -w $PWD busybox rm -rf *

  # Runs after `benchmark`, using the local action ./update-benchmark-history.
  update-history:
    runs-on: ubuntu-latest
    needs: [benchmark]
    steps:
      - uses: actions/checkout@v2
      - uses: ./update-benchmark-history
        name: Update benchmark history
        timeout-minutes: 5