
Commit

Update smoke_test.py
atalman authored May 8, 2024
1 parent 52b55d6 commit f9b9dc8
Showing 1 changed file with 12 additions and 309 deletions.
321 changes: 12 additions & 309 deletions test/smoke_test/smoke_test.py
@@ -1,313 +1,16 @@
import os
import re
import sys
import argparse
import torch
import json
import importlib
import subprocess
import torch._dynamo
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path

gpu_arch_ver = os.getenv("MATRIX_GPU_ARCH_VERSION")
gpu_arch_type = os.getenv("MATRIX_GPU_ARCH_TYPE")
channel = os.getenv("MATRIX_CHANNEL")
package_type = os.getenv("MATRIX_PACKAGE_TYPE")
target_os = os.getenv("TARGET_OS")
BASE_DIR = Path(__file__).parent.parent.parent

is_cuda_system = gpu_arch_type == "cuda"
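# Maximum age, in days, allowed for nightly binaries before the date check fails.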
NIGHTLY_ALLOWED_DELTA = 3

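# Domain libraries checked alongside torch: each entry records the package name,
# the repo to clone, and the smoke-test script that smoke_test_modules() runs.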
MODULES = [
    {
        "name": "torchvision",
        "repo": "https://github.com/pytorch/vision.git",
        "smoke_test": "./vision/test/smoke_test.py",
        "extension": "extension",
        "repo_name": "vision",
    },
    {
        "name": "torchaudio",
        "repo": "https://github.com/pytorch/audio.git",
        "smoke_test": "./audio/test/smoke_test/smoke_test.py --no-ffmpeg",
        "extension": "_extension",
        "repo_name": "audio",
    },
]


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.fc1 = nn.Linear(9216, 1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)
        output = self.fc1(x)
        return output

def load_json_from_basedir(filename: str):
    try:
        with open(BASE_DIR / filename) as fptr:
            return json.load(fptr)
    except FileNotFoundError as exc:
        raise ImportError(f"File {filename} not found error: {exc.strerror}") from exc
    except json.JSONDecodeError as exc:
        raise ImportError(f"Invalid JSON {filename}") from exc

def read_release_matrix():
    return load_json_from_basedir("release_matrix.json")

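# Validate the installed torch (and, when a release version is given, each
# domain library) against the version expected for the current channel.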
def check_version(package: str) -> None:
    release_version = os.getenv("RELEASE_VERSION")
    # if release_version is specified, use it to validate the packages
    if release_version:
        release_matrix = read_release_matrix()
        stable_version = release_matrix["torch"]
    else:
        stable_version = os.getenv("MATRIX_STABLE_VERSION")

    # only makes sense to check the nightly package where dates are known
    if channel == "nightly":
        check_nightly_binaries_date(package)
    elif stable_version is not None:
        if not torch.__version__.startswith(stable_version):
            raise RuntimeError(
                f"Torch version mismatch, expected {stable_version} for channel {channel}. But it's {torch.__version__}"
            )

        if release_version and package == "all":
            for module in MODULES:
                imported_module = importlib.import_module(module["name"])
                module_version = imported_module.__version__
                if not module_version.startswith(release_matrix[module["name"]]):
                    raise RuntimeError(
                        f"{module['name']} version mismatch, expected: \
                            {release_matrix[module['name']]} for channel {channel}. But it's {module_version}"
                    )
                else:
                    print(f"{module['name']} version actual: {module_version} expected: \
                        {release_matrix[module['name']]} for channel {channel}.")
    else:
        print(f"Skip version check for channel {channel} as stable version is None")


import re

cuda_exception_missed = True
try:
    print("Testing test_cuda_runtime_errors_captured")
    torch._assert_async(torch.tensor(0, device="cuda"))
    torch._assert_async(torch.tensor(0 + 0j, device="cuda"))
except RuntimeError as e:
    if re.search("CUDA", f"{e}"):
        print(f"Caught CUDA exception with success: {e}")
        cuda_exception_missed = False


def check_nightly_binaries_date(package: str) -> None:
    from datetime import datetime
    format_dt = '%Y%m%d'

    date_t_str = re.findall("dev\\d+", torch.__version__)
    date_t_delta = datetime.now() - datetime.strptime(date_t_str[0][3:], format_dt)
    if date_t_delta.days >= NIGHTLY_ALLOWED_DELTA:
        raise RuntimeError(
            f"the binaries are from {date_t_str} and are more than {NIGHTLY_ALLOWED_DELTA} days old!"
        )

    if package == "all":
        for module in MODULES:
            imported_module = importlib.import_module(module["name"])
            module_version = imported_module.__version__
            date_m_str = re.findall("dev\\d+", module_version)
            date_m_delta = datetime.now() - datetime.strptime(date_m_str[0][3:], format_dt)
            print(f"Nightly date check for {module['name']} version {module_version}")
            if date_m_delta.days > NIGHTLY_ALLOWED_DELTA:
                raise RuntimeError(
                    f"Expected {module['name']} to be less than {NIGHTLY_ALLOWED_DELTA} days old. But it's {date_m_delta} old."
                )


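# A failing torch._assert_async on a CUDA tensor should surface as a CUDA
# RuntimeError; if no such error is raised, the smoke test itself fails.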
def test_cuda_runtime_errors_captured() -> None:
    cuda_exception_missed = True
    try:
        print("Testing test_cuda_runtime_errors_captured")
        torch._assert_async(torch.tensor(0, device="cuda"))
    except RuntimeError as e:
        if re.search("CUDA", f"{e}"):
            print(f"Caught CUDA exception with success: {e}")
            cuda_exception_missed = False
        else:
            raise e
    if cuda_exception_missed:
        raise RuntimeError("Expected CUDA RuntimeError but did not receive one!")


def smoke_test_cuda(package: str, runtime_error_check: str) -> None:
    if not torch.cuda.is_available() and is_cuda_system:
        raise RuntimeError(f"Expected CUDA {gpu_arch_ver}. However CUDA is not loaded.")

    if package == 'all' and is_cuda_system:
        for module in MODULES:
            imported_module = importlib.import_module(module["name"])
            # TBD for vision move extension module to private so it will
            # be _extension.
            version = "N/A"
            if module["extension"] == "extension":
                version = imported_module.extension._check_cuda_version()
            else:
                version = imported_module._extension._check_cuda_version()
            print(f"{module['name']} CUDA: {version}")

    # torch.compile is available on macos-arm64 and Linux for python 3.8-3.11
    if sys.version_info < (3, 12, 0) and (
            (target_os == "linux" and torch.cuda.is_available()) or
            target_os == "macos-arm64"):
        smoke_test_compile()

    if torch.cuda.is_available():
        if torch.version.cuda != gpu_arch_ver:
            raise RuntimeError(
                f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}"
            )
        print(f"torch cuda: {torch.version.cuda}")
        # todo add cudnn version validation
        print(f"torch cudnn: {torch.backends.cudnn.version()}")
        print(f"cuDNN enabled? {torch.backends.cudnn.enabled}")

        # nccl is available only on Linux
        if sys.platform in ["linux", "linux2"]:
            print(f"torch nccl version: {torch.cuda.nccl.version()}")

    if runtime_error_check == "enabled":
        test_cuda_runtime_errors_captured()


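# Run basic Conv2d forward passes on CPU and, on CUDA systems, under autocast
# and for each supported floating-point dtype.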
def smoke_test_conv2d() -> None:
    import torch.nn as nn

    print("Testing smoke_test_conv2d")
    # With square kernels and equal stride
    m = nn.Conv2d(16, 33, 3, stride=2)
    # non-square kernels and unequal stride and with padding
    m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
    assert m is not None
    # non-square kernels and unequal stride and with padding and dilation
    basic_conv = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
    input = torch.randn(20, 16, 50, 100)
    output = basic_conv(input)

    if is_cuda_system:
        print("Testing smoke_test_conv2d with cuda")
        conv = nn.Conv2d(3, 3, 3).cuda()
        x = torch.randn(1, 3, 24, 24, device="cuda")
        with torch.cuda.amp.autocast():
            out = conv(x)
        assert out is not None

        supported_dtypes = [torch.float16, torch.float32, torch.float64]
        for dtype in supported_dtypes:
            print(f"Testing smoke_test_conv2d with cuda for {dtype}")
            conv = basic_conv.to(dtype).cuda()
            input = torch.randn(20, 16, 50, 100, device="cuda").type(dtype)
            output = conv(input)
            assert output is not None


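# SVD sanity checks on the given device, including a batched input, plus a
# dtype sweep when running on CUDA.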
def test_linalg(device="cpu") -> None:
print(f"Testing smoke_test_linalg on {device}")
A = torch.randn(5, 3, device=device)
U, S, Vh = torch.linalg.svd(A, full_matrices=False)
assert U.shape == A.shape and S.shape == torch.Size([3]) and Vh.shape == torch.Size([3, 3])
torch.dist(A, U @ torch.diag(S) @ Vh)

U, S, Vh = torch.linalg.svd(A)
assert U.shape == torch.Size([5, 5]) and S.shape == torch.Size([3]) and Vh.shape == torch.Size([3, 3])
torch.dist(A, U[:, :3] @ torch.diag(S) @ Vh)

A = torch.randn(7, 5, 3, device=device)
U, S, Vh = torch.linalg.svd(A, full_matrices=False)
torch.dist(A, U @ torch.diag_embed(S) @ Vh)

if device == "cuda":
supported_dtypes = [torch.float32, torch.float64]
for dtype in supported_dtypes:
print(f"Testing smoke_test_linalg with cuda for {dtype}")
A = torch.randn(20, 16, 50, 100, device=device, dtype=dtype)
torch.linalg.svd(A)


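# torch.compile smoke check: compare eager vs. compiled outputs of a small
# function, then compile the Net model above with mode="max-autotune".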
def smoke_test_compile() -> None:
    supported_dtypes = [torch.float16, torch.float32, torch.float64]
    dv = "cuda" if target_os == "linux" else "cpu"

    def foo(x: torch.Tensor) -> torch.Tensor:
        return torch.sin(x) + torch.cos(x)

    for dtype in supported_dtypes:
        print(f"Testing smoke_test_compile for {dtype}")
        x = torch.rand(3, 3, device=dv).type(dtype)
        x_eager = foo(x)
        x_pt2 = torch.compile(foo)(x)
        print(torch.allclose(x_eager, x_pt2))

    # Reset torch dynamo since we are changing mode
    torch._dynamo.reset()
    dtype = torch.float32
    torch.set_float32_matmul_precision('high')
    print(f"Testing smoke_test_compile with mode 'max-autotune' for {dtype}")
    x = torch.rand(64, 1, 28, 28, device=dv).type(torch.float32)
    model = Net().to(device=dv)
    x_pt2 = torch.compile(model, mode="max-autotune")(x)


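# Clone each domain library listed in MODULES (when not already checked out)
# and run its own smoke-test script in a subprocess.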
def smoke_test_modules():
    cwd = os.getcwd()
    for module in MODULES:
        if module["repo"]:
            if not os.path.exists(f"{cwd}/{module['repo_name']}"):
                print(f"Path does not exist: {cwd}/{module['repo_name']}")
                try:
                    subprocess.check_output(
                        f"git clone --depth 1 {module['repo']}",
                        stderr=subprocess.STDOUT,
                        shell=True,
                    )
                except subprocess.CalledProcessError as exc:
                    raise RuntimeError(
                        f"Cloning {module['repo']} FAIL: {exc.returncode} Output: {exc.output}"
                    ) from exc
            try:
                smoke_test_command = f"python3 {module['smoke_test']}"
                if target_os == 'windows':
                    smoke_test_command = f"python {module['smoke_test']}"
                output = subprocess.check_output(
                    smoke_test_command, stderr=subprocess.STDOUT, shell=True,
                    universal_newlines=True)
            except subprocess.CalledProcessError as exc:
                raise RuntimeError(f"Module {module['name']} FAIL: {exc.returncode} Output: {exc.output}") from exc
            else:
                print(f"Output: \n{output}\n")


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--package",
        help="Package to include in smoke testing",
        type=str,
        choices=["all", "torchonly"],
        default="all",
    )
    parser.add_argument(
        "--runtime-error-check",
        help="Enable or disable the CUDA runtime error check",
        type=str,
        choices=["enabled", "disabled"],
        default="enabled",
    )
    options = parser.parse_args()
    print(f"torch: {torch.__version__}")

    smoke_test_cuda(options.package, options.runtime_error_check)


if __name__ == "__main__":
    main()
