-
Notifications
You must be signed in to change notification settings - Fork 295
74 lines (72 loc) · 2.85 KB
/
pr-gpu-stability-ci.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Repeatedly runs one TorchBench model on a GPU runner and checks that the
# repeated runs are stable.
name: TorchBench GPU model stability test
on:
  # Manual run: the model to test is given explicitly as an input.
  workflow_dispatch:
    inputs:
      model:
        description: "Model Name"
        required: true
        default: "fastNLP_Bert"
  # PR run: the job below only proceeds when the PR body contains a
  # 'STABLE_TEST_MODEL:' marker (see the job-level `if`).
  pull_request:
jobs:
  # Runs the selected model REPEAT times in eval and train mode on CUDA, then
  # feeds both logs to .github/scripts/test-repeated-runs.py for the verdict.
  stability_test:
    env:
      # Name of the throwaway conda environment created for this run.
      CONDA_ENV: "stability-test-ci"
      # Scratch directory holding the PR body dump and the eval/train logs.
      TEST_HOME: "/tmp/tb-stability-ci"
      PYTHON_VERSION: "3.8"
      # Selects the PyTorch nightly wheel index (see the pip install below).
      CUDA_VERSION: "cu116"
      # Passed through the environment rather than interpolated into the
      # script, so PR text is never evaluated as shell code.
      PR_BODY: ${{ github.event.pull_request.body }}
      # Empty on pull_request events; resolved from PR_BODY in that case.
      MODEL: ${{ github.event.inputs.model }}
      GPU_ID: "1"
      # Value handed to `nvidia-smi -ac` (application clocks).
      GPU_FREQ: "5001,900"
      # Number of eval runs and number of train runs.
      REPEAT: "10"
    # Run only for manual dispatches with a model, or PRs that opt in.
    if: ${{ (github.event.inputs.model || contains(github.event.pull_request.body, 'STABLE_TEST_MODEL:')) }}
    runs-on: [self-hosted, bm-runner]
    timeout-minutes: 120  # 2 hours
    environment: docker-s3-upload
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Create conda environment with pytorch nightly
        run: |
          conda create -y -n "${CONDA_ENV}" python="${PYTHON_VERSION}"
          . activate "${CONDA_ENV}"
          conda install -y numpy requests=2.22 ninja pyyaml mkl mkl-include setuptools \
            cmake cffi typing_extensions future six dataclasses tabulate gitpython
          # Install pytorch nightly
          pip install --pre torch torchvision torchaudio \
            -f "https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html"
          # Install torchbench dependencies
          python install.py
      - name: Stability test
        run: |
          . activate "${CONDA_ENV}"
          # The default Actions shell is `bash -e` without pipefail; enable it
          # so a failing `python run.py` is not masked by `tee` in the loops.
          set -o pipefail
          mkdir -p "${TEST_HOME}"
          if [ -z "${MODEL}" ] ; then
            # Load PR to file
            PR_BODY_FILE="${TEST_HOME}"/pr-body.txt
            # printf instead of echo: the PR body is arbitrary text and must
            # not be interpreted as echo options.
            printf '%s\n' "${PR_BODY}" > "${PR_BODY_FILE}"
            MODEL=$(python ./.github/scripts/test-repeated-runs.py --pr-body "${PR_BODY_FILE}")
          fi
          if [ -z "${MODEL}" ] ; then
            echo "No model name given and none could be parsed from the PR body." >&2
            exit 1
          fi
          # Setup nvidia gpu frequency
          sudo nvidia-persistenced --user "${USER}" || true
          # NOTE(review): `nvidia-smi -pm` takes a persistence-mode flag (0/1),
          # not a GPU index; this enables persistence mode only because GPU_ID
          # happens to be "1". Neither call selects a GPU with `-i`, so both
          # apply to every GPU. Confirm the intent before changing GPU_ID.
          sudo nvidia-smi -pm "${GPU_ID}"
          sudo nvidia-smi -ac "${GPU_FREQ}"
          # Run the tests
          EVAL_LOG="${TEST_HOME}/eval-${MODEL}.log"
          : > "${EVAL_LOG}"
          for i in $(seq 1 "${REPEAT}"); do
            python run.py "${MODEL}" -t eval -d cuda | tee -a "${EVAL_LOG}"
          done
          TRAIN_LOG="${TEST_HOME}/train-${MODEL}.log"
          : > "${TRAIN_LOG}"
          for i in $(seq 1 "${REPEAT}"); do
            python run.py "${MODEL}" -t train -d cuda | tee -a "${TRAIN_LOG}"
          done
          # Check the stability of GPU tests.
          # Both checks always run, and the step fails if either one fails.
          # (A bare `cmd && echo` list is exempt from `set -e` unless it is the
          # last command, which would let an inference failure go unnoticed.)
          failed=0
          if python ./.github/scripts/test-repeated-runs.py --log "${EVAL_LOG}"; then
            echo "GPU stability test pass for inference!"
          else
            failed=1
          fi
          if python ./.github/scripts/test-repeated-runs.py --log "${TRAIN_LOG}"; then
            echo "GPU stability test pass for train!"
          else
            failed=1
          fi
          exit "${failed}"
      - name: Remove conda environment
        # Clean up even when an earlier step failed, so the self-hosted runner
        # does not accumulate stale environments.
        if: always()
        run: |
          conda env remove --name "${CONDA_ENV}"