diff --git a/.ci_support/linux_64_blas_implgenericc_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version12.yaml b/.ci_support/linux_64_blas_implgenericc_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version12.yaml index 08644b2e..7f3b2658 100644 --- a/.ci_support/linux_64_blas_implgenericc_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version12.yaml +++ b/.ci_support/linux_64_blas_implgenericc_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version12.yaml @@ -64,6 +64,9 @@ pytorch: - '2.4' target_platform: - linux-64 +use_magma: +- 'false' +- 'true' zip_keys: - - c_compiler_version - cxx_compiler_version diff --git a/.ci_support/linux_64_blas_implgenericc_compiler_version13cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13.yaml b/.ci_support/linux_64_blas_implgenericc_compiler_version13cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13.yaml index 7ffe87a0..69fc01db 100644 --- a/.ci_support/linux_64_blas_implgenericc_compiler_version13cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13.yaml +++ b/.ci_support/linux_64_blas_implgenericc_compiler_version13cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13.yaml @@ -64,6 +64,9 @@ pytorch: - '2.4' target_platform: - linux-64 +use_magma: +- 'false' +- 'true' zip_keys: - - c_compiler_version - cxx_compiler_version diff --git a/.ci_support/linux_64_blas_implmklc_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version12.yaml b/.ci_support/linux_64_blas_implmklc_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version12.yaml index 83ae6560..e0e75cc7 100644 --- a/.ci_support/linux_64_blas_implmklc_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version12.yaml +++ b/.ci_support/linux_64_blas_implmklc_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version12.yaml @@ -64,6 +64,9 @@ pytorch: - '2.4' target_platform: - linux-64 +use_magma: +- 'false' +- 'true' zip_keys: - - c_compiler_version - cxx_compiler_version diff --git a/.ci_support/linux_64_blas_implmklc_compiler_version13cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13.yaml b/.ci_support/linux_64_blas_implmklc_compiler_version13cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13.yaml index facbc7ca..c9b53e55 100644 --- a/.ci_support/linux_64_blas_implmklc_compiler_version13cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13.yaml +++ b/.ci_support/linux_64_blas_implmklc_compiler_version13cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13.yaml @@ -64,6 +64,9 @@ pytorch: - '2.4' target_platform: - linux-64 +use_magma: +- 'false' +- 'true' zip_keys: - - c_compiler_version - cxx_compiler_version diff --git a/.ci_support/linux_aarch64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version12.yaml b/.ci_support/linux_aarch64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version12.yaml index 3910b6e5..f0c8756e 100644 --- a/.ci_support/linux_aarch64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version12.yaml +++ b/.ci_support/linux_aarch64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.6cxx_compiler_version12.yaml @@ -64,6 +64,9 @@ pytorch: - '2.4' target_platform: - linux-aarch64 +use_magma: +- 'false' +- 'true' zip_keys: - - 
c_compiler_version - cxx_compiler_version diff --git a/.ci_support/linux_aarch64_c_compiler_version13cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13.yaml b/.ci_support/linux_aarch64_c_compiler_version13cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13.yaml index 4f899c39..0337c1b4 100644 --- a/.ci_support/linux_aarch64_c_compiler_version13cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13.yaml +++ b/.ci_support/linux_aarch64_c_compiler_version13cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version13.yaml @@ -64,6 +64,9 @@ pytorch: - '2.4' target_platform: - linux-aarch64 +use_magma: +- 'false' +- 'true' zip_keys: - - c_compiler_version - cxx_compiler_version diff --git a/README.md b/README.md index 10844c99..85644a73 100644 --- a/README.md +++ b/README.md @@ -189,6 +189,7 @@ Current release info | Name | Downloads | Version | Platforms | | --- | --- | --- | --- | | [![Conda Recipe](https://img.shields.io/badge/recipe-libtorch-green.svg)](https://anaconda.org/conda-forge/libtorch) | [![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/libtorch.svg)](https://anaconda.org/conda-forge/libtorch) | [![Conda Version](https://img.shields.io/conda/vn/conda-forge/libtorch.svg)](https://anaconda.org/conda-forge/libtorch) | [![Conda Platforms](https://img.shields.io/conda/pn/conda-forge/libtorch.svg)](https://anaconda.org/conda-forge/libtorch) | +| [![Conda Recipe](https://img.shields.io/badge/recipe-libtorch--cuda--linalg-green.svg)](https://anaconda.org/conda-forge/libtorch-cuda-linalg) | [![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/libtorch-cuda-linalg.svg)](https://anaconda.org/conda-forge/libtorch-cuda-linalg) | [![Conda Version](https://img.shields.io/conda/vn/conda-forge/libtorch-cuda-linalg.svg)](https://anaconda.org/conda-forge/libtorch-cuda-linalg) | [![Conda Platforms](https://img.shields.io/conda/pn/conda-forge/libtorch-cuda-linalg.svg)](https://anaconda.org/conda-forge/libtorch-cuda-linalg) | | [![Conda Recipe](https://img.shields.io/badge/recipe-pytorch-green.svg)](https://anaconda.org/conda-forge/pytorch) | [![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/pytorch.svg)](https://anaconda.org/conda-forge/pytorch) | [![Conda Version](https://img.shields.io/conda/vn/conda-forge/pytorch.svg)](https://anaconda.org/conda-forge/pytorch) | [![Conda Platforms](https://img.shields.io/conda/pn/conda-forge/pytorch.svg)](https://anaconda.org/conda-forge/pytorch) | | [![Conda Recipe](https://img.shields.io/badge/recipe-pytorch--cpu-green.svg)](https://anaconda.org/conda-forge/pytorch-cpu) | [![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/pytorch-cpu.svg)](https://anaconda.org/conda-forge/pytorch-cpu) | [![Conda Version](https://img.shields.io/conda/vn/conda-forge/pytorch-cpu.svg)](https://anaconda.org/conda-forge/pytorch-cpu) | [![Conda Platforms](https://img.shields.io/conda/pn/conda-forge/pytorch-cpu.svg)](https://anaconda.org/conda-forge/pytorch-cpu) | | [![Conda Recipe](https://img.shields.io/badge/recipe-pytorch--gpu-green.svg)](https://anaconda.org/conda-forge/pytorch-gpu) | [![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/pytorch-gpu.svg)](https://anaconda.org/conda-forge/pytorch-gpu) | [![Conda Version](https://img.shields.io/conda/vn/conda-forge/pytorch-gpu.svg)](https://anaconda.org/conda-forge/pytorch-gpu) | [![Conda Platforms](https://img.shields.io/conda/pn/conda-forge/pytorch-gpu.svg)](https://anaconda.org/conda-forge/pytorch-gpu) | @@ -203,16 +204,16 @@ conda 
config --add channels conda-forge conda config --set channel_priority strict ``` -Once the `conda-forge` channel has been enabled, `libtorch, pytorch, pytorch-cpu, pytorch-gpu` can be installed with `conda`: +Once the `conda-forge` channel has been enabled, `libtorch, libtorch-cuda-linalg, pytorch, pytorch-cpu, pytorch-gpu` can be installed with `conda`: ``` -conda install libtorch pytorch pytorch-cpu pytorch-gpu +conda install libtorch libtorch-cuda-linalg pytorch pytorch-cpu pytorch-gpu ``` or with `mamba`: ``` -mamba install libtorch pytorch pytorch-cpu pytorch-gpu +mamba install libtorch libtorch-cuda-linalg pytorch pytorch-cpu pytorch-gpu ``` It is possible to list all of the versions of `libtorch` available on your platform with `conda`: diff --git a/recipe/README.md b/recipe/README.md index 1a953a8d..cae82de3 100644 --- a/recipe/README.md +++ b/recipe/README.md @@ -83,7 +83,7 @@ of 2024-11-28: | magma | 2.6.1 | | 2.8.0 | `.ci/docker/common/install_magma.sh` | | libabseil | indirect? | | 20240722.0 | | | libuv | | | 1.49.2 | (not pinned) | -| mkl | 2024.2.0 | | 2024.2.2 | `.ci/docker/common/install_mkl.sh` | +| mkl | 2024.2.0 | <2024 | 2023.2.0 | `.ci/docker/common/install_mkl.sh` | | nccl | 2.21.5+ | | 2.23.4.1 | `third_party/nccl/nccl` | | protobuf | 3.7.0rc2+ | | 5.28.2 | `third_party/protobuf` | | sleef | 3.6+ | | 3.7 | `third_party/sleef` | @@ -98,3 +98,92 @@ of 2024-11-28: | sympy | ==1.13.1 | >=1.13.1, !=1.13.2 | 1.13.3 | (wheel metadata) | | typing-extensions | >=4.8.0 | | 4.12.2 | (wheel metadata) | | triton | 3.1.0 | none | 3.1.0 | (wheel metadata) | + + +Maintenance notes +================= + +Packages built by the recipe +---------------------------- +The recipe currently builds four packages: + +1. `libtorch` that installs the common libraries, executables and data files + that are independent of the selected Python version and are therefore shared + by all Python versions. + +2. `libtorch-cuda-linalg` that provides the shared `libtorch_cuda_linalg.so` + library, in variants linked against `magma` or not, and is built only for + GPU-enabled variants. + +3. `pytorch` that installs the library and other files for a specific Python + version. + +4. `pytorch-cpu` or `pytorch-gpu`, a backwards-compatibility metapackage. + +These packages can be built in the following variants: + +- `cpu` variant that does not use CUDA, or `cuda` variant built using + a specific CUDA version (`libtorch-cuda-linalg` is built only in `cuda` + variants). + +- `mkl` variant that uses MKL to provide BLAS/LAPACK, as well as a set + of additional functions, and `generic` variant that can use any BLAS/LAPACK + provider (implemented by patching the upstream OpenBLAS support). + +Additionally, `libtorch-cuda-linalg` can be built in a `magma` or `nomagma` +variant. The former links against libmagma, while the latter avoids this +significant dependency. Both variants support the built-in cuSOLVER backend, +and the `magma` variant normally uses a heuristic to choose between MAGMA +and cuSOLVER, in order to achieve the best performance for a given operation. + +Some of the platforms support only a subset of these variants. + +The recipe supports a `megabuild` mode that is currently used for Linux +configurations. In this mode, PyTorch is built for all Python versions +in a single run. As a result, the shared bits (`libtorch*`) are only built once. + +As the `megabuild` mode imposes high disk space requirements on the CI builders, +it is currently not used on other platforms. 
For this reason, there are separate +configurations for every Python version on those platforms. + + +The build process +----------------- +The upstream build system consists of a heavily customized `setup.py` script, +based on the setuptools build system, that performs some preparations related +to building C++ code and then calls into CMake to build it (i.e. it is not +possible to use CMake directly). The build process can be customized using +environment variables, some of them processed directly by the setup script, +others converted into `-D` options for CMake. When looking for available +options, `setup.py` and `tools/setup_helpers/cmake.py` are the two primary +files to look at. + +Normally, the setup code only runs the `cmake` generate step if `CMakeCache.txt` +does not exist yet. Therefore, on subsequent calls environment variables do not +affect the CMake build. It is technically possible to force rerunning it by +appending the `--cmake` option, but that usually causes the build system to consider +all targets out of date, and therefore rebuild everything from scratch. Instead, +we edit `CMakeCache.txt` directly, which triggers the build step +to detect the changes and regenerate (see the sketch below). + +To facilitate split package builds, we perform the build in the following steps: + +1. For the top-level rule (`libtorch-split`), we perform the base environment + setup and run `setup.py build` to build the libraries and collect the data + files without actually installing them. Then we move the files we need + into temporary directories for repackaging. + + a. If `megabuild` is enabled, we build against a fixed Python version. + Otherwise, we build using the final Python version. + + b. If CUDA support is enabled, we build with `magma` disabled first. + Then we copy the resulting library, and rebuild with `magma` enabled. + This way, we obtain the two versions of the library to repackage. + +2. For the `libtorch` and `libtorch-cuda-linalg` packages, we manually install + files that were prepared earlier. + +3. For the final `pytorch` package(s), we invoke `pip install` to build + and install the complete package. Importantly, this reuses previously built + targets, so only Python-related bits are rebuilt. In `megabuild` mode, + we patch `CMakeCache.txt` to set the correct Python version. 
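A minimal sketch of the `CMakeCache.txt` editing technique described above, assuming an already-configured `build/` tree; the `sed` expression mirrors the `USE_MAGMA` edit performed by `build_common.sh` further down in this diff, while the surrounding invocations are illustrative:

```bash
# First invocation: setup.py runs the cmake generate step and
# creates build/CMakeCache.txt.
python setup.py build

# Flip a cached option in place instead of rerunning the generate step;
# the next build detects the cache change and regenerates incrementally.
sed -i -e "/USE_MAGMA/s:=.*:=1:" build/CMakeCache.txt
python setup.py build
```
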
diff --git a/recipe/bld.bat b/recipe/bld.bat index 30cc5d4f..a00ffaff 100644 --- a/recipe/bld.bat +++ b/recipe/bld.bat @@ -1,47 +1,3 @@ @echo On -set TH_BINARY_BUILD=1 -set PYTORCH_BUILD_VERSION=%PKG_VERSION% -set PYTORCH_BUILD_NUMBER=%PKG_BUILDNUM% - -if "%pytorch_variant%" == "gpu" ( - set build_with_cuda=1 - set desired_cuda=%CUDA_VERSION:~0,-1%.%CUDA_VERSION:~-1,1% -) else ( - set build_with_cuda= - set USE_CUDA=0 -) - -if "%build_with_cuda%" == "" goto cuda_flags_end - -set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%desired_cuda% -set CUDA_BIN_PATH=%CUDA_PATH%\bin -set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;8.9;9.0+PTX -set TORCH_NVCC_FLAGS=-Xfatbin -compress-all - -:cuda_flags_end - -set DISTUTILS_USE_SDK=1 - -set CMAKE_INCLUDE_PATH=%LIBRARY_PREFIX%\include -set LIB=%LIBRARY_PREFIX%\lib;%LIB% - -IF "%build_with_cuda%" == "" goto cuda_end - -set MAGMA_HOME=%LIBRARY_PREFIX% - -set "PATH=%CUDA_BIN_PATH%;%PATH%" - -set CUDNN_INCLUDE_DIR=%LIBRARY_PREFIX%\include - -:cuda_end - -set CMAKE_GENERATOR=Ninja -set "CMAKE_GENERATOR_PLATFORM=" -set "CMAKE_PREFIX_PATH=%LIBRARY_PREFIX%" -set "libuv_ROOT=%LIBRARY_PREFIX%" -set "USE_SYSTEM_SLEEF=OFF" -set "BUILD_CUSTOM_PROTOBUF=OFF" - -%PYTHON% -m pip install . --no-deps -vv -if errorlevel 1 exit /b 1 +call %RECIPE_DIR%\build_common.bat diff --git a/recipe/build.sh b/recipe/build.sh index 9a12aeb1..8a9603bf 100644 --- a/recipe/build.sh +++ b/recipe/build.sh @@ -1,229 +1,3 @@ -#!/bin/bash - -set -ex - -# This is used to detect if it's in the process of building pytorch -export IN_PYTORCH_BUILD=1 - -# https://github.com/conda-forge/pytorch-cpu-feedstock/issues/243 -# https://github.com/pytorch/pytorch/blob/v2.3.1/setup.py#L341 -export PACKAGE_TYPE=conda - -# remove pyproject.toml to avoid installing deps from pip -rm -rf pyproject.toml - -# uncomment to debug cmake build -# export CMAKE_VERBOSE_MAKEFILE=1 - -export USE_CUFILE=0 -export USE_NUMA=0 -export USE_ITT=0 -export CFLAGS="$(echo $CFLAGS | sed 's/-fvisibility-inlines-hidden//g')" -export CXXFLAGS="$(echo $CXXFLAGS | sed 's/-fvisibility-inlines-hidden//g')" -export LDFLAGS="$(echo $LDFLAGS | sed 's/-Wl,--as-needed//g')" -export LDFLAGS="$(echo $LDFLAGS | sed 's/-Wl,-dead_strip_dylibs//g')" -export LDFLAGS_LD="$(echo $LDFLAGS_LD | sed 's/-dead_strip_dylibs//g')" -if [[ "$c_compiler" == "clang" ]]; then - export CXXFLAGS="$CXXFLAGS -Wno-deprecated-declarations -Wno-unknown-warning-option -Wno-error=unused-command-line-argument -Wno-error=vla-cxx-extension" - export CFLAGS="$CFLAGS -Wno-deprecated-declarations -Wno-unknown-warning-option -Wno-error=unused-command-line-argument -Wno-error=vla-cxx-extension" -else - export CXXFLAGS="$CXXFLAGS -Wno-deprecated-declarations -Wno-error=maybe-uninitialized" - export CFLAGS="$CFLAGS -Wno-deprecated-declarations -Wno-error=maybe-uninitialized" -fi - -# This is not correctly found for linux-aarch64 since pytorch 2.0.0 for some reason -export _GLIBCXX_USE_CXX11_ABI=1 - -# KINETO seems to require CUPTI and will look quite hard for it. -# CUPTI seems to cause trouble when users install a version of -# cudatoolkit different than the one specified at compile time. 
-# https://github.com/conda-forge/pytorch-cpu-feedstock/issues/135 -export USE_KINETO=OFF - -if [[ "$target_platform" == "osx-64" ]]; then - export CXXFLAGS="$CXXFLAGS -DTARGET_OS_OSX=1" - export CFLAGS="$CFLAGS -DTARGET_OS_OSX=1" -fi - -# Dynamic libraries need to be lazily loaded so that torch -# can be imported on system without a GPU -LDFLAGS="${LDFLAGS//-Wl,-z,now/-Wl,-z,lazy}" - -export CMAKE_GENERATOR=Ninja -export CMAKE_LIBRARY_PATH=$PREFIX/lib:$PREFIX/include:$CMAKE_LIBRARY_PATH -export CMAKE_PREFIX_PATH=$PREFIX -export CMAKE_BUILD_TYPE=Release - -for ARG in $CMAKE_ARGS; do - if [[ "$ARG" == "-DCMAKE_"* ]]; then - cmake_arg=$(echo $ARG | cut -d= -f1) - cmake_arg=$(echo $cmake_arg| cut -dD -f2-) - cmake_val=$(echo $ARG | cut -d= -f2-) - printf -v $cmake_arg "$cmake_val" - export ${cmake_arg} - fi -done -CMAKE_FIND_ROOT_PATH+=";$SRC_DIR" -unset CMAKE_INSTALL_PREFIX -export TH_BINARY_BUILD=1 -export PYTORCH_BUILD_VERSION=$PKG_VERSION -export PYTORCH_BUILD_NUMBER=$PKG_BUILDNUM - -export INSTALL_TEST=0 -export BUILD_TEST=0 - -export USE_SYSTEM_SLEEF=1 -# use our protobuf -export BUILD_CUSTOM_PROTOBUF=OFF -rm -rf $PREFIX/bin/protoc - -# prevent six from being downloaded -> third_party/NNPACK/cmake/DownloadSix.cmake - -if [[ "${target_platform}" != "${build_platform}" ]]; then - # It helps cross compiled builds without emulation support to complete - # Use BUILD PREFIX protoc instead of the one that is from the host platform - sed -i.bak \ - "s,IMPORTED_LOCATION_RELEASE .*/bin/protoc,IMPORTED_LOCATION_RELEASE \"${BUILD_PREFIX}/bin/protoc," \ - ${PREFIX}/lib/cmake/protobuf/protobuf-targets-release.cmake -fi - -# I don't know where this folder comes from, but it's interfering with the build in osx-64 -rm -rf $PREFIX/git - -if [[ "$CONDA_BUILD_CROSS_COMPILATION" == 1 ]]; then - export COMPILER_WORKS_EXITCODE=0 - export COMPILER_WORKS_EXITCODE__TRYRUN_OUTPUT="" -fi - -if [[ "${CI}" == "github_actions" ]]; then - # h-vetinari/hmaarrfk -- May 2024 - # reduce parallelism to avoid getting OOM-killed on - # cirun-openstack-gpu-2xlarge, which has 32GB RAM, 8 CPUs - export MAX_JOBS=4 -else - export MAX_JOBS=${CPU_COUNT} -fi - -if [[ "$blas_impl" == "generic" ]]; then - # Fake openblas - export BLAS=OpenBLAS - sed -i.bak "s#FIND_LIBRARY.*#set(OpenBLAS_LIB ${PREFIX}/lib/liblapack${SHLIB_EXT} ${PREFIX}/lib/libcblas${SHLIB_EXT} ${PREFIX}/lib/libblas${SHLIB_EXT})#g" cmake/Modules/FindOpenBLAS.cmake -else - export BLAS=MKL -fi - -if [[ "$PKG_NAME" == "pytorch" ]]; then - PIP_ACTION=install - # Trick Cmake into thinking python hasn't changed - sed "s/3\.12/$PY_VER/g" build/CMakeCache.txt.orig > build/CMakeCache.txt - sed -i.bak "s/3;12/${PY_VER%.*};${PY_VER#*.}/g" build/CMakeCache.txt - sed -i.bak "s/cpython-312/cpython-${PY_VER%.*}${PY_VER#*.}/g" build/CMakeCache.txt -else - # For the main script we just build a wheel for so that the C++/CUDA - # parts are built. Then they are reused in each python version. - PIP_ACTION=wheel -fi - -# MacOS build is simple, and will not be for CUDA -if [[ "$OSTYPE" == "darwin"* ]]; then - # Produce macOS builds with torch.distributed support. - # This is enabled by default on Linux, but disabled by default on macOS, - # because it requires an non-bundled compile-time dependency (libuv - # through gloo). This dependency is made available through meta.yaml, so - # we can override the default and set USE_DISTRIBUTED=1. 
- export USE_DISTRIBUTED=1 - - if [[ "$target_platform" == "osx-arm64" ]]; then - # MKLDNN did not support on Apple M1 at the time support Apple M1 - # was added. Revisit later - export USE_MKLDNN=0 - fi -elif [[ ${cuda_compiler_version} != "None" ]]; then - if [[ "$target_platform" == "linux-aarch64" ]]; then - # https://github.com/pytorch/pytorch/pull/121975 - # https://github.com/conda-forge/pytorch-cpu-feedstock/issues/264 - export USE_PRIORITIZED_TEXT_FOR_LD=1 - fi - # Even though cudnn is used for CUDA builds, it's good to enable - # for MKLDNN for CUDA builds when CUDA builds are used on a machine - # with no NVIDIA GPUs. - export USE_MKLDNN=1 - export USE_CUDA=1 - export USE_CUFILE=1 - # PyTorch has multiple different bits of logic finding CUDA, override - # all of them. - export CUDAToolkit_BIN_DIR=${BUILD_PREFIX}/bin - export CUDAToolkit_ROOT_DIR=${PREFIX} - if [[ "${target_platform}" != "${build_platform}" ]]; then - export CUDA_TOOLKIT_ROOT=${PREFIX} - fi - case ${target_platform} in - linux-64) - export CUDAToolkit_TARGET_DIR=${PREFIX}/targets/x86_64-linux - ;; - linux-aarch64) - export CUDAToolkit_TARGET_DIR=${PREFIX}/targets/sbsa-linux - ;; - *) - echo "unknown CUDA arch, edit build.sh" - exit 1 - esac - case ${cuda_compiler_version} in - 12.6) - export TORCH_CUDA_ARCH_LIST="5.0;6.0;6.1;7.0;7.5;8.0;8.6;8.9;9.0+PTX" - ;; - *) - echo "unsupported cuda version. edit build.sh" - exit 1 - esac - export TORCH_NVCC_FLAGS="-Xfatbin -compress-all" - export NCCL_ROOT_DIR=$PREFIX - export NCCL_INCLUDE_DIR=$PREFIX/include - export USE_SYSTEM_NCCL=1 - export USE_STATIC_NCCL=0 - export USE_STATIC_CUDNN=0 - export MAGMA_HOME="${PREFIX}" -else - if [[ "$target_platform" != *-64 ]]; then - # Breakpad seems to not work on aarch64 or ppc64le - # https://github.com/pytorch/pytorch/issues/67083 - export USE_BREAKPAD=0 - fi - # MKLDNN is an Apache-2.0 licensed library for DNNs and is used - # for CPU builds. Not to be confused with MKL. - export USE_MKLDNN=1 - export USE_CUDA=0 -fi - -echo '${CXX}'=${CXX} -echo '${PREFIX}'=${PREFIX} -$PREFIX/bin/python -m pip $PIP_ACTION . --no-deps -vvv --no-clean \ - | sed "s,${CXX},\$\{CXX\},g" \ - | sed "s,${PREFIX},\$\{PREFIX\},g" - -if [[ "$PKG_NAME" == "libtorch" ]]; then - mkdir -p $SRC_DIR/dist - pushd $SRC_DIR/dist - wheel unpack ../torch-*.whl - pushd torch-* - mv torch/bin/* ${PREFIX}/bin - mv torch/lib/* ${PREFIX}/lib - mv torch/share/* ${PREFIX}/share - for f in ATen caffe2 tensorpipe torch c10; do - mv torch/include/$f ${PREFIX}/include/$f - done - rm ${PREFIX}/lib/libtorch_python.* - popd - popd - - # Keep the original backed up to sed later - cp build/CMakeCache.txt build/CMakeCache.txt.orig -else - # Keep this in ${PREFIX}/lib so that the library can be found by - # TorchConfig.cmake. - # With upstream non-split build, `libtorch_python.so` - # and TorchConfig.cmake are both in ${SP_DIR}/torch/lib and therefore - # this is not needed. 
- mv ${SP_DIR}/torch/lib/libtorch_python${SHLIB_EXT} ${PREFIX}/lib fi +# we are using a separate file here to avoid conda-build thinking that +# magma is used in the top-level package build +source $RECIPE_DIR/build_common.sh diff --git a/recipe/build_common.bat b/recipe/build_common.bat new file mode 100644 index 00000000..30cc5d4f --- /dev/null +++ b/recipe/build_common.bat @@ -0,0 +1,47 @@ +@echo On + +set TH_BINARY_BUILD=1 +set PYTORCH_BUILD_VERSION=%PKG_VERSION% +set PYTORCH_BUILD_NUMBER=%PKG_BUILDNUM% + +if "%pytorch_variant%" == "gpu" ( + set build_with_cuda=1 + set desired_cuda=%CUDA_VERSION:~0,-1%.%CUDA_VERSION:~-1,1% +) else ( + set build_with_cuda= + set USE_CUDA=0 +) + +if "%build_with_cuda%" == "" goto cuda_flags_end + +set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%desired_cuda% +set CUDA_BIN_PATH=%CUDA_PATH%\bin +set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;8.9;9.0+PTX +set TORCH_NVCC_FLAGS=-Xfatbin -compress-all + +:cuda_flags_end + +set DISTUTILS_USE_SDK=1 + +set CMAKE_INCLUDE_PATH=%LIBRARY_PREFIX%\include +set LIB=%LIBRARY_PREFIX%\lib;%LIB% + +IF "%build_with_cuda%" == "" goto cuda_end + +set MAGMA_HOME=%LIBRARY_PREFIX% + +set "PATH=%CUDA_BIN_PATH%;%PATH%" + +set CUDNN_INCLUDE_DIR=%LIBRARY_PREFIX%\include + +:cuda_end + +set CMAKE_GENERATOR=Ninja +set "CMAKE_GENERATOR_PLATFORM=" +set "CMAKE_PREFIX_PATH=%LIBRARY_PREFIX%" +set "libuv_ROOT=%LIBRARY_PREFIX%" +set "USE_SYSTEM_SLEEF=OFF" +set "BUILD_CUSTOM_PROTOBUF=OFF" + +%PYTHON% -m pip install . --no-deps -vv +if errorlevel 1 exit /b 1 diff --git a/recipe/build_common.sh b/recipe/build_common.sh new file mode 100644 index 00000000..ef72c019 --- /dev/null +++ b/recipe/build_common.sh @@ -0,0 +1,268 @@ +#!/bin/bash + +echo "=== Building ${PKG_NAME} (magma: ${use_magma}; py: ${PY_VER}) ===" + +set -ex + +# This is used to detect if it's in the process of building pytorch +export IN_PYTORCH_BUILD=1 + +# https://github.com/conda-forge/pytorch-cpu-feedstock/issues/243 +# https://github.com/pytorch/pytorch/blob/v2.3.1/setup.py#L341 +export PACKAGE_TYPE=conda + +# remove pyproject.toml to avoid installing deps from pip +rm -rf pyproject.toml + +# uncomment to debug cmake build +# export CMAKE_VERBOSE_MAKEFILE=1 + +export USE_CUFILE=0 +export USE_NUMA=0 +export USE_ITT=0 +export CFLAGS="$(echo $CFLAGS | sed 's/-fvisibility-inlines-hidden//g')" +export CXXFLAGS="$(echo $CXXFLAGS | sed 's/-fvisibility-inlines-hidden//g')" +export LDFLAGS="$(echo $LDFLAGS | sed 's/-Wl,--as-needed//g')" +export LDFLAGS="$(echo $LDFLAGS | sed 's/-Wl,-dead_strip_dylibs//g')" +export LDFLAGS_LD="$(echo $LDFLAGS_LD | sed 's/-dead_strip_dylibs//g')" +if [[ "$c_compiler" == "clang" ]]; then + export CXXFLAGS="$CXXFLAGS -Wno-deprecated-declarations -Wno-unknown-warning-option -Wno-error=unused-command-line-argument -Wno-error=vla-cxx-extension" + export CFLAGS="$CFLAGS -Wno-deprecated-declarations -Wno-unknown-warning-option -Wno-error=unused-command-line-argument -Wno-error=vla-cxx-extension" +else + export CXXFLAGS="$CXXFLAGS -Wno-deprecated-declarations -Wno-error=maybe-uninitialized" + export CFLAGS="$CFLAGS -Wno-deprecated-declarations -Wno-error=maybe-uninitialized" +fi + +# This is not correctly found for linux-aarch64 since pytorch 2.0.0 for some reason +export _GLIBCXX_USE_CXX11_ABI=1 + +# KINETO seems to require CUPTI and will look quite hard for it. +# CUPTI seems to cause trouble when users install a version of +# cudatoolkit different than the one specified at compile time. 
+# https://github.com/conda-forge/pytorch-cpu-feedstock/issues/135 +export USE_KINETO=OFF + +if [[ "$target_platform" == "osx-64" ]]; then + export CXXFLAGS="$CXXFLAGS -DTARGET_OS_OSX=1" + export CFLAGS="$CFLAGS -DTARGET_OS_OSX=1" +fi + +# Dynamic libraries need to be lazily loaded so that torch +# can be imported on systems without a GPU +LDFLAGS="${LDFLAGS//-Wl,-z,now/-Wl,-z,lazy}" + +export CMAKE_GENERATOR=Ninja +export CMAKE_LIBRARY_PATH=$PREFIX/lib:$PREFIX/include:$CMAKE_LIBRARY_PATH +export CMAKE_PREFIX_PATH=$PREFIX +export CMAKE_BUILD_TYPE=Release + +for ARG in $CMAKE_ARGS; do + if [[ "$ARG" == "-DCMAKE_"* ]]; then + cmake_arg=$(echo $ARG | cut -d= -f1) + cmake_arg=$(echo $cmake_arg| cut -dD -f2-) + cmake_val=$(echo $ARG | cut -d= -f2-) + printf -v $cmake_arg "$cmake_val" + export ${cmake_arg} + fi +done +CMAKE_FIND_ROOT_PATH+=";$SRC_DIR" +unset CMAKE_INSTALL_PREFIX +export TH_BINARY_BUILD=1 +export PYTORCH_BUILD_VERSION=$PKG_VERSION +export PYTORCH_BUILD_NUMBER=$PKG_BUILDNUM + +export INSTALL_TEST=0 +export BUILD_TEST=0 + +export USE_SYSTEM_SLEEF=1 +# use our protobuf +export BUILD_CUSTOM_PROTOBUF=OFF +rm -rf $PREFIX/bin/protoc + +# prevent six from being downloaded +> third_party/NNPACK/cmake/DownloadSix.cmake + +if [[ "${target_platform}" != "${build_platform}" ]]; then + # It helps cross compiled builds without emulation support to complete + # Use BUILD PREFIX protoc instead of the one that is from the host platform + sed -i.bak \ + "s,IMPORTED_LOCATION_RELEASE .*/bin/protoc,IMPORTED_LOCATION_RELEASE \"${BUILD_PREFIX}/bin/protoc," \ + ${PREFIX}/lib/cmake/protobuf/protobuf-targets-release.cmake +fi + +# I don't know where this folder comes from, but it's interfering with the build in osx-64 +rm -rf $PREFIX/git + +if [[ "$CONDA_BUILD_CROSS_COMPILATION" == 1 ]]; then + export COMPILER_WORKS_EXITCODE=0 + export COMPILER_WORKS_EXITCODE__TRYRUN_OUTPUT="" +fi + +if [[ "${CI}" == "github_actions" ]]; then + # h-vetinari/hmaarrfk -- May 2024 + # reduce parallelism to avoid getting OOM-killed on + # cirun-openstack-gpu-2xlarge, which has 32GB RAM, 8 CPUs + export MAX_JOBS=4 +else + export MAX_JOBS=${CPU_COUNT} +fi + +if [[ "$blas_impl" == "generic" ]]; then + # Fake openblas + export BLAS=OpenBLAS + sed -i.bak "s#FIND_LIBRARY.*#set(OpenBLAS_LIB ${PREFIX}/lib/liblapack${SHLIB_EXT} ${PREFIX}/lib/libcblas${SHLIB_EXT} ${PREFIX}/lib/libblas${SHLIB_EXT})#g" cmake/Modules/FindOpenBLAS.cmake +else + export BLAS=MKL +fi + +if [[ "$PKG_NAME" == "pytorch" ]]; then + # Trick CMake into thinking python hasn't changed + sed "s/3\.12/$PY_VER/g" build/CMakeCache.txt.orig > build/CMakeCache.txt + sed -i.bak "s/3;12/${PY_VER%.*};${PY_VER#*.}/g" build/CMakeCache.txt + sed -i.bak "s/cpython-312/cpython-${PY_VER%.*}${PY_VER#*.}/g" build/CMakeCache.txt +fi + +# The macOS build is simple, and will never be a CUDA build +if [[ "$OSTYPE" == "darwin"* ]]; then + # Produce macOS builds with torch.distributed support. + # This is enabled by default on Linux, but disabled by default on macOS, + # because it requires a non-bundled compile-time dependency (libuv + # through gloo). This dependency is made available through meta.yaml, so + # we can override the default and set USE_DISTRIBUTED=1. + export USE_DISTRIBUTED=1 + + if [[ "$target_platform" == "osx-arm64" ]]; then + # MKLDNN was not supported on Apple M1 at the time Apple M1 + # support was added. Revisit later + export USE_MKLDNN=0 + fi +elif [[ ${cuda_compiler_version} != "None" ]]; then + if [[ "$target_platform" == "linux-aarch64" ]]; then + # https://github.com/pytorch/pytorch/pull/121975 + # https://github.com/conda-forge/pytorch-cpu-feedstock/issues/264 + export USE_PRIORITIZED_TEXT_FOR_LD=1 + fi + # Even though cudnn is used for CUDA builds, it's good to enable + # MKLDNN for CUDA builds, since CUDA builds may be used on a machine + # with no NVIDIA GPUs. + export USE_MKLDNN=1 + export USE_CUDA=1 + export USE_CUFILE=1 + # PyTorch has multiple different bits of logic finding CUDA, override + # all of them. + export CUDAToolkit_BIN_DIR=${BUILD_PREFIX}/bin + export CUDAToolkit_ROOT_DIR=${PREFIX} + if [[ "${target_platform}" != "${build_platform}" ]]; then + export CUDA_TOOLKIT_ROOT=${PREFIX} + fi + case ${target_platform} in + linux-64) + export CUDAToolkit_TARGET_DIR=${PREFIX}/targets/x86_64-linux + ;; + linux-aarch64) + export CUDAToolkit_TARGET_DIR=${PREFIX}/targets/sbsa-linux + ;; + *) + echo "unknown CUDA arch, edit build_common.sh" + exit 1 + esac + case ${cuda_compiler_version} in + 12.6) + export TORCH_CUDA_ARCH_LIST="5.0;6.0;6.1;7.0;7.5;8.0;8.6;8.9;9.0+PTX" + ;; + *) + echo "unsupported cuda version. edit build_common.sh" + exit 1 + esac + export TORCH_NVCC_FLAGS="-Xfatbin -compress-all" + export NCCL_ROOT_DIR=$PREFIX + export NCCL_INCLUDE_DIR=$PREFIX/include + export USE_SYSTEM_NCCL=1 + export USE_STATIC_NCCL=0 + export USE_STATIC_CUDNN=0 + export MAGMA_HOME="${PREFIX}" + # Perform the initial build without magma enabled; we'll enable + # it for the remaining builds (particularly, to have it enabled + # for pytorch). + export USE_MAGMA=0 +else + if [[ "$target_platform" != *-64 ]]; then + # Breakpad seems to not work on aarch64 or ppc64le + # https://github.com/pytorch/pytorch/issues/67083 + export USE_BREAKPAD=0 + fi + # MKLDNN is an Apache-2.0 licensed library for DNNs and is used + # for CPU builds. Not to be confused with MKL. + export USE_MKLDNN=1 + export USE_CUDA=0 +fi + +echo '${CXX}'=${CXX} +echo '${PREFIX}'=${PREFIX} + +case ${PKG_NAME} in + libtorch-split) + # Call setup.py directly to avoid spending time on unnecessarily + # packing and unpacking the wheel. + $PREFIX/bin/python setup.py build + + mkdir -p dist-libtorch/include dist-libtorch-cuda-linalg-{magma,nomagma}/lib + mv build/lib.*/torch/{bin,lib,share} dist-libtorch/ + mv build/lib.*/torch/include/{ATen,caffe2,tensorpipe,torch,c10} dist-libtorch/include/ + rm dist-libtorch/lib/libtorch_python.* + if [[ ${cuda_compiler_version} != "None" ]]; then + mv dist-libtorch/lib/libtorch_cuda_linalg.* dist-libtorch-cuda-linalg-nomagma/lib/ + + # Now rebuild with magma enabled. + sed -i -e "/USE_MAGMA/s:=.*:=1:" build/CMakeCache.txt + $PREFIX/bin/python setup.py build + mv build/lib.*/torch/lib/libtorch_cuda_linalg.* dist-libtorch-cuda-linalg-magma/lib/ + fi + + # Keep the original backed up to sed later + cp build/CMakeCache.txt build/CMakeCache.txt.orig + ;; + libtorch) + mv dist-libtorch/bin/* ${PREFIX}/bin/ + mv dist-libtorch/lib/* ${PREFIX}/lib/ + mv dist-libtorch/share/* ${PREFIX}/share/ + mv dist-libtorch/include/* ${PREFIX}/include/ + ;; + libtorch-cuda-linalg) + if [[ ${use_magma} == true ]]; then + mv dist-libtorch-cuda-linalg-magma/lib/* ${PREFIX}/lib/ + else + mv dist-libtorch-cuda-linalg-nomagma/lib/* ${PREFIX}/lib/ + fi + ;; + pytorch) + $PREFIX/bin/python -m pip install . 
--no-deps -vvv --no-clean \ + | sed "s,${CXX},\$\{CXX\},g" \ + | sed "s,${PREFIX},\$\{PREFIX\},g" + # Keep this in ${PREFIX}/lib so that the library can be found by + # TorchConfig.cmake. + # With upstream non-split build, `libtorch_python.so` + # and TorchConfig.cmake are both in ${SP_DIR}/torch/lib and therefore + # this is not needed. + # + # NB: we are using cp rather than mv, so that the loop below symlinks it + # back. + cp ${SP_DIR}/torch/lib/libtorch_python${SHLIB_EXT} ${PREFIX}/lib + + pushd $SP_DIR/torch + # Make symlinks for libraries and headers from libtorch into $SP_DIR/torch + # Also remove the vendored libraries they seem to include + # https://github.com/conda-forge/pytorch-cpu-feedstock/issues/243 + # https://github.com/pytorch/pytorch/blob/v2.3.1/setup.py#L341 + for f in bin/* lib/* share/* include/*; do + if [[ -e "$PREFIX/$f" ]]; then + rm -rf $f + ln -sf $PREFIX/$f $PWD/$f + fi + done + popd + ;; + *) + echo "Unknown package name, edit build_common.sh" + exit 1 +esac diff --git a/recipe/build_pytorch.sh b/recipe/build_pytorch.sh index 9f2eab82..6e5fa09b 100644 --- a/recipe/build_pytorch.sh +++ b/recipe/build_pytorch.sh @@ -1,17 +1 @@ -set -x source $RECIPE_DIR/build.sh - -# if $SP_DIR/torch doesn't exist here, the installation -# of pytorch (see build_libtorch.sh call above) failed -pushd $SP_DIR/torch - -# Make symlinks for libraries and headers from libtorch into $SP_DIR/torch -# Also remove the vendorered libraries they seem to include -# https://github.com/conda-forge/pytorch-cpu-feedstock/issues/243 -# https://github.com/pytorch/pytorch/blob/v2.3.1/setup.py#L341 -for f in bin/* lib/* share/* include/*; do - if [[ -e "$PREFIX/$f" ]]; then - rm -rf $f - ln -sf $PREFIX/$f $PWD/$f - fi -done diff --git a/recipe/conda_build_config.yaml b/recipe/conda_build_config.yaml index 86840e97..89891379 100644 --- a/recipe/conda_build_config.yaml +++ b/recipe/conda_build_config.yaml @@ -22,3 +22,7 @@ github_actions_labels: # [linux] megabuild: - true # [linux] - false # [osx] + +use_magma: +- true +- false diff --git a/recipe/meta.yaml b/recipe/meta.yaml index bea64c3a..c48cb00d 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -9,8 +9,15 @@ {% set build = build + 100 %} {% endif %} +{% set mkl = "<2024" %} + +# a reasonably safe subset of tests that should run in under 15 minutes +# disable hypothesis because it randomly yields health check errors +{% set tests = "test/test_autograd.py test/test_autograd_fallback.py test/test_custom_ops.py test/test_linalg.py test/test_mkldnn.py test/test_modules.py test/test_nn.py test/test_torch.py test/test_xnnpack_integration.py --deselect test/test_torch.py::TestTorch::test_print -m 'not hypothesis'" %} +{% set tests_3_13_deselects = "--deselect test/test_custom_ops.py::TestCustomOp::test_data_dependent_compile --deselect test/test_custom_ops.py::TestCustomOp::test_functionalize_error --deselect test/test_custom_ops.py::TestCustomOpAPI::test_compile --deselect test/test_custom_ops.py::TestCustomOpAPI::test_fake -k 'not test_compile_int4_mm and not test_compile_int8_mm'" %} + package: - name: libtorch + name: libtorch-split version: {{ version }} source: @@ -35,21 +42,14 @@ source: - patches/0009-Allow-libcufile-for-conda-builds.patch # conda-specific patch, lets us override CUDA paths - patches/0010-Allow-overriding-CUDA-related-paths.patch + # NumPy 2 fixes: + # https://github.com/pytorch/pytorch/pull/136800 + - patches/0011-Fix-test-test_linalg.py-for-NumPy-2-136800.patch + # https://github.com/pytorch/pytorch/pull/137740 + - 
patches/0012-Fixes-NumPy-2-test-failures-in-test_torch.py-137740.patch build: number: {{ build }} - string: cuda{{ cuda_compiler_version | replace('.', '') }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [cuda_compiler_version != "None"] - string: cpu_{{ blas_impl }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [cuda_compiler_version == "None"] - detect_binary_files_with_prefix: false - run_exports: - - {{ pin_subpackage('libtorch', max_pin='x.x') }} - ignore_run_exports_from: - - python * # [megabuild] - - numpy * # [megabuild] - - cross-python_{{ target_platform }} # [megabuild and build_platform != target_platform] - ignore_run_exports: - - python * # [megabuild] - - numpy * # [megabuild] skip: true # [win] # cuda 11.8 was dropped due to maintenance effort, see discussion in #177 skip: true # [cuda_compiler_version == "11.8"] @@ -125,54 +125,221 @@ requirements: - libuv - pkg-config # [unix] - typing_extensions - run: - # GPU requirements without run_exports - - {{ pin_compatible('cudnn') }} # [cuda_compiler_version != "None"] - run_constrained: - # These constraints ensure conflict between pytorch and - # pytorch-cpu 1.1 which we built before conda-forge had GPU infrastructure - # built into place. - # https://github.com/conda-forge/pytorch-cpu-feedstock/issues/65 - - pytorch-cpu =={{ version }} # [cuda_compiler_version == "None"] - - pytorch-gpu ==99999999 # [cuda_compiler_version == "None"] - - pytorch-gpu =={{ version }} # [cuda_compiler_version != "None"] - - pytorch-cpu ==99999999 # [cuda_compiler_version != "None"] - - pytorch {{ version }} cuda{{ cuda_compiler_version | replace('.', '') }}_*_{{ PKG_BUILDNUM }} # [cuda_compiler_version != "None"] - - pytorch {{ version }} cpu_{{ blas_impl }}_*_{{ PKG_BUILDNUM }} # [cuda_compiler_version == "None"] - # See following link for sysroot consraint addition - # https://github.com/conda-forge/pytorch-cpu-feedstock/pull/293#issuecomment-2503611320 - # 2024/12 hmaarrfk's summary: - # The medium term solution is to add such a constraint to libcufile - # The long term solution is to add such a constraint to all packages - # that depend on a specific sysroot at building. 
- - sysroot_{{ target_platform }} >={{ c_stdlib_version }} - -# these tests are for the libtorch output below, but due to -# a particularity of conda-build, that output is defined in -# the global build stage, including tests -test: - commands: - # libraries - {% for each_lib in [ 'libc10', 'libshm', 'libtorch', 'libtorch_cpu', 'libtorch_global_deps'] %} - - test -f $PREFIX/lib/{{ each_lib }}.so # [linux] - - test -f $PREFIX/lib/{{ each_lib }}.dylib # [osx] - {% endfor %} - {% for each_lib in ['libc10_cuda', 'libcaffe2_nvrtc', 'libtorch_cuda', 'libtorch_cuda_linalg'] %} - - test -f $PREFIX/lib/{{ each_lib }}.so # [linux and cuda_compiler_version != "None"] - {% endfor %} outputs: + - name: libtorch-cuda-linalg + build: + string: cuda{{ cuda_compiler_version | replace('.', '') }}_magma_{{ blas_impl }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [use_magma] + string: cuda{{ cuda_compiler_version | replace('.', '') }}_nomagma_{{ blas_impl }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [not use_magma] + skip: true # [win or cuda_compiler_version == "None"] + ignore_run_exports_from: + - python * # [megabuild] + - numpy * # [megabuild] + - cross-python_{{ target_platform }} # [megabuild and build_platform != target_platform] + ignore_run_exports: + - python * # [megabuild] + - numpy * # [megabuild] + track_features: + - nomagma # [not use_magma] + script: build_common.sh # [unix] + script: build_common.bat # [win] + requirements: + build: + # When you change 3.12 here, change it in build_common.sh as well + - python 3.12 # [megabuild and build_platform != target_platform] + - python # [not megabuild and build_platform != target_platform] + - cross-python_{{ target_platform }} # [build_platform != target_platform] + - numpy * # [megabuild and build_platform != target_platform] + - numpy # [not megabuild and build_platform != target_platform] + - {{ stdlib('c') }} + - {{ compiler('c') }} + - {{ compiler('cxx') }} + - {{ compiler('cuda') }} + - patch + - git + - libgomp + - cmake + - ninja + # Keep libprotobuf here so that a compatible version + # of protobuf is installed between build and host + - libprotobuf + - protobuf + - make + host: + # GPU requirements + - cudnn + - nccl + - cuda-version {{ cuda_compiler_version }} + - nvtx-c + - cuda-driver-dev + - cuda-cudart-dev + - cuda-nvrtc-dev + - cuda-nvtx-dev + - cuda-nvml-dev + - cuda-profiler-api + - libcublas-dev + - libcufile-dev + - libcufft-dev + - libcurand-dev + - libcusolver-dev + - libcusparse-dev + - magma # [use_magma] + # other requirements + - python 3.12 # [megabuild] + - python # [not megabuild] + - numpy * # [megabuild] + - numpy # [not megabuild] + - pip + - setuptools + - pyyaml + - requests + - six + - mkl-devel {{ mkl }} # [blas_impl == "mkl"] + - libcblas * *_mkl # [blas_impl == "mkl"] + - libcblas # [blas_impl != "mkl"] + - liblapack # [blas_impl != "mkl"] + - libgomp + - libabseil + - libprotobuf + - sleef + - libuv + - pkg-config + - typing_extensions + run: + # GPU requirements without run_exports + - {{ pin_compatible('cudnn') }} + run_constrained: + - magma <0.0a0 # [not use_magma] + - name: libtorch + build: + string: cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [cuda_compiler_version != "None"] + string: cpu_{{ blas_impl }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [cuda_compiler_version == "None"] + detect_binary_files_with_prefix: false + run_exports: + - {{ pin_subpackage('libtorch', max_pin='x.x') }} + ignore_run_exports_from: + - magma * + - python * # [megabuild] + - numpy * # 
[megabuild] + - cross-python_{{ target_platform }} # [megabuild and build_platform != target_platform] + ignore_run_exports: + - python * # [megabuild] + - numpy * # [megabuild] + script: build.sh # [unix] + script: bld.bat # [win] + requirements: + build: + # When you change 3.12 here, change it in build_common.sh as well + - python 3.12 # [megabuild and build_platform != target_platform] + - python # [not megabuild and build_platform != target_platform] + - cross-python_{{ target_platform }} # [build_platform != target_platform] + - numpy * # [megabuild and build_platform != target_platform] + - numpy # [not megabuild and build_platform != target_platform] + - {{ stdlib('c') }} + - {{ compiler('c') }} + - {{ compiler('cxx') }} + - {{ compiler('cuda') }} # [cuda_compiler_version != "None"] + # Dec 2020: it seems that git is broken on windows, so we use m2-git + - m2-patch # [win] + - m2-git # [win] + - patch # [not win] + - git # [not win] + - libgomp # [linux] + - llvm-openmp # [osx] + - cmake + - ninja + # Keep libprotobuf here so that a compatible version + # of protobuf is installed between build and host + - libprotobuf + - protobuf + - make # [linux] + host: + # GPU requirements + - cudnn # [cuda_compiler_version != "None"] + - nccl # [cuda_compiler_version != "None"] + - cuda-version {{ cuda_compiler_version }} # [cuda_compiler_version != "None"] + - nvtx-c # [cuda_compiler_version != "None"] + {% if cuda_compiler_version != "None" %} + - cuda-driver-dev + - cuda-cudart-dev + - cuda-nvrtc-dev + - cuda-nvtx-dev + - cuda-nvml-dev + - cuda-profiler-api + - libcublas-dev + - libcufile-dev + - libcufft-dev + - libcurand-dev + - libcusolver-dev + - libcusparse-dev + {% endif %} + # other requirements + - python 3.12 # [megabuild] + - python # [not megabuild] + - numpy * # [megabuild] + - numpy # [not megabuild] + - pip + - setuptools + - pyyaml + - requests + - six + - mkl-devel {{ mkl }} # [blas_impl == "mkl"] + - libcblas * *_mkl # [blas_impl == "mkl"] + - libcblas # [blas_impl != "mkl"] + - liblapack # [blas_impl != "mkl"] + - libgomp # [linux] + - llvm-openmp # [osx] + - libabseil + - libprotobuf + - sleef + - libuv + - pkg-config # [unix] + - typing_extensions + - libtorch-cuda-linalg {{ version }} # [cuda_compiler_version != "None"] + run: + - libtorch-cuda-linalg {{ version }} # [cuda_compiler_version != "None"] + # GPU requirements without run_exports + - {{ pin_compatible('cudnn') }} # [cuda_compiler_version != "None"] + run_constrained: + # These constraints ensure conflict between pytorch and + # pytorch-cpu 1.1 which we built before conda-forge had GPU infrastructure + # built into place. 
+ # https://github.com/conda-forge/pytorch-cpu-feedstock/issues/65 + - pytorch-cpu =={{ version }} # [cuda_compiler_version == "None"] + - pytorch-gpu ==99999999 # [cuda_compiler_version == "None"] + - pytorch-gpu =={{ version }} # [cuda_compiler_version != "None"] + - pytorch-cpu ==99999999 # [cuda_compiler_version != "None"] + - pytorch {{ version }} cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}_*_{{ PKG_BUILDNUM }} # [cuda_compiler_version != "None"] + - pytorch {{ version }} cpu_{{ blas_impl }}_*_{{ PKG_BUILDNUM }} # [cuda_compiler_version == "None"] + # See the following link for the sysroot constraint addition + # https://github.com/conda-forge/pytorch-cpu-feedstock/pull/293#issuecomment-2503611320 + # 2024/12 hmaarrfk's summary: + # The medium term solution is to add such a constraint to libcufile + # The long term solution is to add such a constraint to all packages + # that depend on a specific sysroot at building. + - sysroot_{{ target_platform }} >={{ c_stdlib_version|default(0) }} + test: + commands: + # libraries + {% for each_lib in [ 'libc10', 'libshm', 'libtorch', 'libtorch_cpu', 'libtorch_global_deps'] %} + - test -f $PREFIX/lib/{{ each_lib }}.so # [linux] + - test -f $PREFIX/lib/{{ each_lib }}.dylib # [osx] + {% endfor %} + {% for each_lib in ['libc10_cuda', 'libcaffe2_nvrtc', 'libtorch_cuda'] %} + - test -f $PREFIX/lib/{{ each_lib }}.so # [linux and cuda_compiler_version != "None"] + {% endfor %} + - name: pytorch build: - string: cuda{{ cuda_compiler_version | replace('.', '') }}_py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [cuda_compiler_version != "None"] - string: cpu_{{ blas_impl }}_py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [cuda_compiler_version == "None"] + string: cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}_py{{ CONDA_PY }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [cuda_compiler_version != "None"] + string: cpu_{{ blas_impl }}_py{{ CONDA_PY }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [cuda_compiler_version == "None"] detect_binary_files_with_prefix: false + ignore_run_exports_from: + - magma run_exports: - {{ pin_subpackage('pytorch', max_pin='x.x') }} - {{ pin_subpackage('libtorch', max_pin='x.x') }} - skip: true # [win] - skip: true # [cuda_compiler_version != "None" and linux64 and blas_impl != "mkl"] script: build_pytorch.sh # [unix] script: build_pytorch.bat # [win] @@ -203,9 +370,9 @@ outputs: # GPU requirements - cudnn # [cuda_compiler_version != "None"] - nccl # [cuda_compiler_version != "None"] - - magma # [cuda_compiler_version != "None"] - cuda-version {{ cuda_compiler_version }} # [cuda_compiler_version != "None"] - nvtx-c # [cuda_compiler_version != "None"] + - magma # [cuda_compiler_version != "None"] {% if cuda_compiler_version != "None" %} - cuda-driver-dev - cuda-cudart-dev @@ -274,15 +441,18 @@ requires: - {{ compiler('c') }} - {{ compiler('cxx') }} + - ninja - boto3 - hypothesis - pytest - tabulate - pydot - - mock # [linux] - pip - expecttest - xmlrunner + - pytest-flakefinder + - pytest-rerunfailures + - pytest-xdist imports: - torch # [not (aarch64 and cuda_compiler_version != "None")] source_files: @@ -291,8 +461,13 @@ # as of pytorch=2.0.0, there is a bug when trying to run tests without the tools - tools commands: - - OMP_NUM_THREADS=4 python ./test/run_test.py || true # [not win and not (aarch64 and cuda_compiler_version != "None")] - - python ./test/run_test.py # [win] + # the whole test suite takes forever, but we should get good enough coverage + # for potential 
packaging problems by running a fixed subset + - OMP_NUM_THREADS=4 python -m pytest -n auto {{ tests }} # [not win and not (aarch64 and cuda_compiler_version != "None") and py != 313] + - python -m pytest -n auto {{ tests }} # [win and py != 313] + # dynamo does not support python 3.13 + - OMP_NUM_THREADS=4 python -m pytest -n auto {{ tests_3_13_deselects }} {{ tests }} # [not win and not (aarch64 and cuda_compiler_version != "None") and py == 313] + - python -m pytest -n auto {{ tests_3_13_deselects }} {{ tests }} # [win and py == 313] # Run pip check so as to ensure that all pytorch packages are installed # https://github.com/conda-forge/pytorch-cpu-feedstock/issues/24 - pip check @@ -314,18 +489,17 @@ {% set pytorch_cpu_gpu = "pytorch-gpu" %} # [cuda_compiler_version != "None"] - name: {{ pytorch_cpu_gpu }} build: - string: cuda{{ cuda_compiler_version | replace('.', '') }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [megabuild and cuda_compiler_version != "None"] - string: cpu_{{ blas_impl }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [megabuild and cuda_compiler_version == "None"] - string: cuda{{ cuda_compiler_version | replace('.', '') }}py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [not megabuild and cuda_compiler_version != "None"] - string: cpu_{{ blas_impl }}_py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [not megabuild and cuda_compiler_version == "None"] + string: cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [megabuild and cuda_compiler_version != "None"] + string: cpu_{{ blas_impl }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [megabuild and cuda_compiler_version == "None"] + string: cuda{{ cuda_compiler_version | replace('.', '') }}_{{ blas_impl }}_py{{ CONDA_PY }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [not megabuild and cuda_compiler_version != "None"] + string: cpu_{{ blas_impl }}_py{{ CONDA_PY }}_h{{ PKG_HASH }}_{{ PKG_BUILDNUM }} # [not megabuild and cuda_compiler_version == "None"] detect_binary_files_with_prefix: false - skip: true # [cuda_compiler_version != "None" and linux64 and blas_impl != "mkl"] # weigh down cpu implementation and give cuda preference track_features: - pytorch-cpu # [cuda_compiler_version == "None"] requirements: run: - - pytorch {{ version }}=cuda*{{ PKG_BUILDNUM }} # [megabuild and cuda_compiler_version != "None"] + - pytorch {{ version }}=cuda*_{{ blas_impl }}_*{{ PKG_BUILDNUM }} # [megabuild and cuda_compiler_version != "None"] - pytorch {{ version }}=cpu_{{ blas_impl }}*{{ PKG_BUILDNUM }} # [megabuild and cuda_compiler_version == "None"] - {{ pin_subpackage("pytorch", exact=True) }} # [not megabuild] test: diff --git a/recipe/patches/0011-Fix-test-test_linalg.py-for-NumPy-2-136800.patch b/recipe/patches/0011-Fix-test-test_linalg.py-for-NumPy-2-136800.patch new file mode 100644 index 00000000..1247841b --- /dev/null +++ b/recipe/patches/0011-Fix-test-test_linalg.py-for-NumPy-2-136800.patch @@ -0,0 +1,80 @@ +From f5c485918fac838d989c5aa1d4dcc6651273eacd Mon Sep 17 00:00:00 2001 +From: Haifeng Jin +Date: Tue, 1 Oct 2024 07:53:24 +0000 +Subject: [PATCH 4/5] Fix test/test_linalg.py for NumPy 2 (#136800) + +Related to #107302. + +When built and tested with NumPy 2 the following unit tests failed. 
+ +``` +=========================================================== short test summary info ============================================================ +FAILED [0.0026s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_complex128 - TypeError: expected np.ndarray (got Tensor) +FAILED [0.0024s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_complex64 - TypeError: expected np.ndarray (got Tensor) +FAILED [0.0025s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_float32 - TypeError: expected np.ndarray (got Tensor) +FAILED [0.0024s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_float64 - TypeError: expected np.ndarray (got Tensor) +FAILED [0.0016s] test/test_linalg.py::TestLinalgCPU::test_nuclear_norm_axes_small_brute_force_old_cpu - ValueError: Unable to avoid copy while creating an array as requested. +FAILED [0.0054s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_complex128 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]). +FAILED [0.0055s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_complex64 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]). +FAILED [0.0048s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_float32 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]). +FAILED [0.0054s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_float64 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]). +=========================================== 9 failed, 1051 passed, 118 skipped in 152.51s (0:02:32) ============================================ +``` + +This PR fixes them. The test is now compatible with both NumPy 1 & 2. + +Some more details: + +1. The `np.linalg.solve` has changed its behavior. So I added an adapt function in the unit test to keep its behavior the same no matter it is NumPy 1 or Numpy 2. +2. The cause of the failure is when passing a `torch.Tensor` to `np.linalg.qr`, the return type in NumPy 1 is `(np.ndarray, np.ndarray)`, while it is `(torch.Tensor, torch.Tensor)` in NumPy 2. +3. NumPy 2 does not allow `np.array(obj, copy=False)`, but recommended to use `np.asarray(obj)` instead. + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/136800 +Approved by: https://github.com/lezcano +--- + test/test_linalg.py | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +diff --git a/test/test_linalg.py b/test/test_linalg.py +index e9ec874d6..060bccef2 100644 +--- a/test/test_linalg.py ++++ b/test/test_linalg.py +@@ -2351,7 +2351,7 @@ class TestLinalg(TestCase): + if self.device_type != 'cpu' and randrange(100) < 95: + return # too many cpu <==> device copies + +- a = np.array(x.cpu(), copy=False) ++ a = np.asarray(x.cpu()) + expected = np.linalg.norm(a, "nuc", axis=axes) + + ans = torch.norm(x, "nuc", dim=axes) +@@ -3082,7 +3082,14 @@ class TestLinalg(TestCase): + self.assertEqual(b.expand_as(Ax), Ax) + + # Check against NumPy +- expected = np.linalg.solve(A.cpu().numpy(), b.expand_as(x).cpu().numpy()) ++ if rhs == (): ++ # In NumPy 2, "b" can no longer be a vector (i.e. rhs == ()) if has batch dimensions. ++ # So, reshape it to a matrix and back. 
Related documentation: ++ # https://numpy.org/doc/1.26/reference/generated/numpy.linalg.solve.html ++ # https://numpy.org/doc/2.0/reference/generated/numpy.linalg.solve.html ++ expected = np.linalg.solve(A.cpu().numpy(), b.cpu().numpy().reshape(*b.shape, 1)).reshape(b.shape) ++ else: ++ expected = np.linalg.solve(A.cpu().numpy(), b.cpu().numpy()) + self.assertEqual(x, expected) + + batches = [(), (0, ), (3, ), (2, 3)] +@@ -5234,7 +5241,9 @@ class TestLinalg(TestCase): + tau_shape = [*A_cpu.shape[:-2], A_cpu.shape[-1]] + tau = torch.empty(tau_shape, dtype=dtype).view(-1, A_cpu.shape[-1]) + for A_i, reflectors_i, tau_i in zip(A_cpu.contiguous().view(*flattened_batch_shape), reflectors, tau): +- reflectors_tmp, tau_i[:] = map(torch.from_numpy, np.linalg.qr(A_i, mode='raw')) ++ reflectors_tmp, tau_i[:] = ( ++ torch.from_numpy(x) if isinstance(x, np.ndarray) else x for x in np.linalg.qr(A_i, mode='raw') ++ ) + reflectors_i[:] = reflectors_tmp.T + reflectors = reflectors.view(*A_cpu.shape) + tau = tau.view(tau_shape) +-- +2.47.1 + diff --git a/recipe/patches/0012-Fixes-NumPy-2-test-failures-in-test_torch.py-137740.patch b/recipe/patches/0012-Fixes-NumPy-2-test-failures-in-test_torch.py-137740.patch new file mode 100644 index 00000000..3ecdfec0 --- /dev/null +++ b/recipe/patches/0012-Fixes-NumPy-2-test-failures-in-test_torch.py-137740.patch @@ -0,0 +1,63 @@ +From e9d0fadc19ca7677a4598f8fbbf03721667a64bf Mon Sep 17 00:00:00 2001 +From: Haifeng Jin +Date: Sat, 12 Oct 2024 02:40:17 +0000 +Subject: [PATCH 5/5] Fixes NumPy 2 test failures in test_torch.py (#137740) + +Related to #107302 + +The breakages are caused by backward incompatibility between NumPy 1 and NumPy 2. +This PR fixes all the corresponding test failures in `test_torch.py`. + +1. The dtype of the return value `np.percentile` when passed a `torch.float32` tensor. +NumPy 1: Return value of `np.float64`. +NumPy 2: Return value of `np.float32`. +Solution: Enforce it with `.astype(np.float64)`. + +2. The type of `np.gradient()` when returning multiple arrays. +NumPy1: A list of arrays. +NumPy2: A tuple of arrays. +Solution: Cast the tuple to a list. +Pull Request resolved: https://github.com/pytorch/pytorch/pull/137740 +Approved by: https://github.com/ezyang +--- + test/test_torch.py | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/test/test_torch.py b/test/test_torch.py +index be4d61808..c6fd6ac9f 100644 +--- a/test/test_torch.py ++++ b/test/test_torch.py +@@ -2891,7 +2891,7 @@ else: + + # if the given input arg is not a list, it returns a list of single element: [arg] + def _wrap_to_list(self, input_array): +- return input_array if isinstance(input_array, list) else [input_array] ++ return list(input_array) if isinstance(input_array, (list, tuple)) else [input_array] + + # To ensure inf, -inf, and nan values do not cause divergence between Numpy and PyTorch. + # There are two types of possible divergence: +@@ -3029,7 +3029,7 @@ else: + # Result is given just as real number and all the imaginary parts to be equal to zero. 
+ self.assertEqual(expected[i].imag, torch.zeros(actual[i].shape), exact_dtype=False) + else: +- actual, expected = self._inf_nan_preprocess(list(actual), expected) ++ actual, expected = self._inf_nan_preprocess(list(actual), list(expected)) + self.assertEqual(actual, expected, equal_nan=True, exact_dtype=False) + + @onlyNativeDeviceTypes +@@ -7549,10 +7549,10 @@ class TestTorch(TestCase): + torch.mean(sample, dim=0), torch.full((d,), 0.5), atol=2, rtol=2 + ) + torch.testing.assert_close( +- np.percentile(sample, 25, axis=0), np.repeat(0.25, d), atol=2, rtol=2 ++ np.percentile(sample, 25, axis=0).astype(np.float64), np.repeat(0.25, d), atol=2, rtol=2 + ) + torch.testing.assert_close( +- np.percentile(sample, 75, axis=0), np.repeat(0.75, d), atol=2, rtol=2 ++ np.percentile(sample, 75, axis=0).astype(np.float64), np.repeat(0.75, d), atol=2, rtol=2 + ) + + @skipIfTorchDynamo("np.float64 restored as float32 after graph break.") +-- +2.47.1 +
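As a closing note, the pinned test subset that `meta.yaml` introduces above can be reproduced locally when debugging packaging problems; a minimal sketch, assuming a built checkout with `pytest-xdist` installed (the file list abbreviates the full `{{ tests }}` definition):

```bash
# Mirrors the `python -m pytest -n auto {{ tests }}` command from the
# recipe's test section, on a representative subset of the test files.
OMP_NUM_THREADS=4 python -m pytest -n auto \
    test/test_autograd.py test/test_linalg.py test/test_torch.py \
    --deselect test/test_torch.py::TestTorch::test_print \
    -m 'not hypothesis'
```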