Add support for reducing across the middle dimension for 3D matrices using the sum Triton kernel #3638

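For context on the PR title, here is a minimal, self-contained sketch of summing a 3D tensor of shape (M, N, K) over its middle dimension N with a Triton kernel, validated against `torch.sum(x, dim=1)`. This is illustrative only and is not the PR's kernel or API: the kernel name, block size, launch grid, and the assumption of a contiguous float32 input are all choices made here for the example.

```python
import torch
import triton
import triton.language as tl


@triton.jit
def sum_mid_dim_kernel(x_ptr, out_ptr, M, N, K, BLOCK_K: tl.constexpr):
    # One program per (output row m, block of K columns).
    pid_m = tl.program_id(0)
    pid_k = tl.program_id(1)
    offs_k = pid_k * BLOCK_K + tl.arange(0, BLOCK_K)
    mask_k = offs_k < K
    acc = tl.zeros([BLOCK_K], dtype=tl.float32)
    # Accumulate over the middle dimension N of a contiguous (M, N, K) tensor.
    for n in range(0, N):
        ptrs = x_ptr + pid_m * N * K + n * K + offs_k
        acc += tl.load(ptrs, mask=mask_k, other=0.0)
    tl.store(out_ptr + pid_m * K + offs_k, acc, mask=mask_k)


def sum_mid_dim(x: torch.Tensor) -> torch.Tensor:
    # Illustrative host wrapper; assumes x is contiguous float32 on GPU.
    M, N, K = x.shape
    out = torch.empty((M, K), device=x.device, dtype=torch.float32)
    BLOCK_K = 128
    grid = (M, triton.cdiv(K, BLOCK_K))
    sum_mid_dim_kernel[grid](x, out, M, N, K, BLOCK_K=BLOCK_K)
    return out


if torch.cuda.is_available():
    x = torch.randn(8, 16, 32, device="cuda", dtype=torch.float32)
    # Check the sketch against the eager reference reduction.
    torch.testing.assert_close(sum_mid_dim(x), x.sum(dim=1))
```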
name: TorchBench GPU model stability test
on:
  workflow_dispatch:
    inputs:
      model:
        description: "Model Name"
        required: true
        default: "fastNLP_Bert"
  pull_request:
jobs:
  stability_test:
    env:
      CONDA_ENV: "stability-test-ci"
      TEST_HOME: "/tmp/tb-stability-ci"
      PYTHON_VERSION: "3.8"
      CUDA_VERSION: "cu116"
      PR_BODY: ${{ github.event.pull_request.body }}
      MODEL: ${{ github.event.inputs.model }}
      GPU_ID: "1"
      GPU_FREQ: "5001,900"
      REPEAT: "10"
    if: ${{ (github.event.inputs.model || contains(github.event.pull_request.body, 'STABLE_TEST_MODEL:')) }}
    runs-on: [self-hosted, bm-runner]
    timeout-minutes: 120 # 2 hours
    environment: docker-s3-upload
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Create conda environment with pytorch nightly
        run: |
          conda create -y -n "${CONDA_ENV}" python="${PYTHON_VERSION}"
          . activate "${CONDA_ENV}"
          conda install -y numpy requests=2.22 ninja pyyaml mkl mkl-include setuptools \
            cmake cffi typing_extensions future six dataclasses tabulate gitpython
          # Install pytorch nightly
          pip install --pre torch torchvision torchaudio \
            -f https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html
          # Install torchbench dependencies
          python install.py
      - name: Stability test
        run: |
          . activate "${CONDA_ENV}"
          mkdir -p "${TEST_HOME}"
          if [ -z "${MODEL}" ]; then
            # Write the PR body to a file and extract the model name from it
            PR_BODY_FILE="${TEST_HOME}"/pr-body.txt
            echo "${PR_BODY}" > "${PR_BODY_FILE}"
            MODEL=`python ./.github/scripts/test-repeated-runs.py --pr-body "${PR_BODY_FILE}"`
          fi
          # Set up the NVIDIA GPU frequency
          sudo nvidia-persistenced --user "${USER}" || true
          sudo nvidia-smi -pm "${GPU_ID}"
          sudo nvidia-smi -ac "${GPU_FREQ}"
          # Run the tests
          EVAL_LOG="${TEST_HOME}/eval-${MODEL}.log"
          echo -n > "${EVAL_LOG}"
          for i in `seq 1 ${REPEAT}`; do
            python run.py "${MODEL}" -t eval -d cuda | tee -a "${EVAL_LOG}"
          done
          TRAIN_LOG="${TEST_HOME}/train-${MODEL}.log"
          echo -n > "${TRAIN_LOG}"
          for i in `seq 1 ${REPEAT}`; do
            python run.py "${MODEL}" -t train -d cuda | tee -a "${TRAIN_LOG}"
          done
          # Check the stability of the GPU tests
          python ./.github/scripts/test-repeated-runs.py --log "${EVAL_LOG}" && \
            echo "GPU stability test passed for inference!"
          python ./.github/scripts/test-repeated-runs.py --log "${TRAIN_LOG}" && \
            echo "GPU stability test passed for training!"
      - name: Remove conda environment
        run: |
          conda env remove --name "${CONDA_ENV}"