facebookresearch · danthe3rd · Nov 15, 2022 · Nov 11, 2022 · Nov 11, 2022 · Nov 11, 2022
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -81,6 +81,7 @@ setup_conda: &setup_conda
 install_dep: &install_dep
   - run:
       name: Install Dependencies with torch nightly
+      no_output_timeout: 30m
       command: |
         source $BASH_ENV
 
@@ -94,7 +95,7 @@ install_dep: &install_dep
         conda install ninja
         echo "Ninja version $(ninja --version)"
 
-        conda install pytorch=1.12.1 "torchvision>=0.13" torchaudio cudatoolkit=11.3 -c pytorch -q
+        conda install pytorch=1.13 torchvision torchaudio pytorch-cuda=11.6 -c pytorch -c nvidia -q
         $CONDA_PYTHON -m pip install -r requirements-benchmark.txt --progress-bar off
 
         # Mark install as complete
@@ -109,7 +110,7 @@ install_dep_exp: &install_dep_exp
         if [ -f /home/circleci/venv/check_version.py ]; then $CONDA_PYTHON  /home/circleci/venv/check_version.py torch gt 1.11 && exit 0; fi
         # start installing
         source activate /home/circleci/venv
-        conda install pytorch=1.12.1 "torchvision>=0.13" torchaudio cudatoolkit=11.3 -c pytorch -q
+        conda install pytorch=1.13 torchvision torchaudio pytorch-cuda=11.6 -c pytorch -c nvidia -q
         $CONDA_PYTHON -m pip install -r experimental/requirements.txt --progress-bar off
 
 install_repo: &install_repo
@@ -158,6 +159,7 @@ run_mypy: &run_mypy
        when: always
        command: |
         source $BASH_ENV
+        $CONDA_PYTHON -m mypy --version
         $CONDA_PYTHON -m mypy --ignore-missing-imports --scripts-are-modules --pretty --exclude "(build|stubs|third_party|docs|setup.py)" .
 
 run_flake8: &run_flake8
@@ -375,6 +377,8 @@ jobs:
     parameters:
       dockerimage:
         type: string
+      pytorch_version:
+        type: string
       cu_version:
         type: string
 
@@ -390,14 +394,17 @@ jobs:
 
     steps:
       - checkout
-
+      - run: git submodule update --init --recursive
       - run:
-          name: conda build
+          name: conda build for py3_9
           no_output_timeout: 20m
           command: |
-            git submodule update --init --recursive
-            python packaging/conda/build_conda.py  --cuda << parameters.cu_version >> --python 3.9 --pytorch 1.12.1 --upload-dev
-            python packaging/conda/build_conda.py  --cuda << parameters.cu_version >> --python 3.10 --pytorch 1.12.1 --upload-dev
+            python packaging/conda/build_conda.py --cuda << parameters.cu_version >> --python 3.9 --pytorch << parameters.pytorch_version >>
+      - run:
+          name: conda build for py3_10
+          no_output_timeout: 20m
+          command: |
+            python packaging/conda/build_conda.py --cuda << parameters.cu_version >> --python 3.10 --pytorch << parameters.pytorch_version >>
 
   gpu_tests_cu114_sm75:
     <<: *gpu_cu114
@@ -542,21 +549,25 @@ workflows:
               - gh-pages
 
       - build_conda:
-          name: conda_build_cu113
+          name: conda_build_cu113_1.12.1
           dockerimage: pytorch/conda-builder:cuda113
           cu_version: "11.3"
-          filters:
-            branches:
-              only:
-                - main
+          pytorch_version: "1.12.1"
       - build_conda:
-          name: conda_build_cu116
+          name: conda_build_cu116_1.12.1
           dockerimage: pytorch/conda-builder:cuda116
           cu_version: "11.6"
-          filters:
-            branches:
-              only:
-                - main
+          pytorch_version: "1.12.1"
+      - build_conda:
+          name: conda_build_cu116_1.13
+          dockerimage: pytorch/conda-builder:cuda116
+          cu_version: "11.6"
+          pytorch_version: "1.13"
+      - build_conda:
+          name: conda_build_cu117_1.13
+          dockerimage: pytorch/conda-builder:cuda117
+          cu_version: "11.7"
+          pytorch_version: "1.13"
       - binary_linux_wheel:
           python_version: "3.7"
           name: binary_linux_wheel_py37_cu102

diff --git a/packaging/conda/build_conda.py b/packaging/conda/build_conda.py
@@ -16,6 +16,7 @@
     "1.11.0": ["10.2", "11.1", "11.3", "11.5"],
     "1.12.0": ["10.2", "11.3", "11.6"],
     "1.12.1": ["10.2", "11.3", "11.6"],
+    "1.13": ["11.6", "11.7"],
 }
 
 
@@ -33,6 +34,8 @@ def conda_docker_image_for_cuda(cuda_version):
         return "pytorch/conda-builder:cuda115"
     if cuda_version == "11.6":
         return "pytorch/conda-builder:cuda116"
+    if cuda_version == "11.7":
+        return "pytorch/conda-builder:cuda117"
     raise ValueError(f"Unknown cuda version {cuda_version}")
 
 
@@ -94,9 +97,12 @@ def _set_env_for_build(self):
         os.environ["PYTORCH_VERSION"] = self.pytorch_version
         os.environ["CU_VERSION"] = self.cuda_version
         os.environ["SOURCE_ROOT_DIR"] = str(SOURCE_ROOT_DIR)
-        os.environ["CONDA_CUDATOOLKIT_CONSTRAINT"] = version_constraint(
-            self.cuda_version
-        )
+        cuda_constraint = version_constraint(self.cuda_version)
+        pytorch_version_tuple = tuple(int(v) for v in self.pytorch_version.split("."))
+        if pytorch_version_tuple < (1, 13):
+            os.environ["CONDA_CUDA_CONSTRAINT"] = f"cudatoolkit{cuda_constraint}"
+        else:
+            os.environ["CONDA_CUDA_CONSTRAINT"] = f"pytorch-cuda{cuda_constraint}"
         os.environ["FORCE_CUDA"] = "1"
 
         if self.conda_always_copy:
@@ -107,7 +113,9 @@ def _get_build_args(self):
             "conda",
             "build",
             "-c",
-            "fastchan",  # which can avoid needing pytorch and conda-forge
+            "pytorch",
+            "-c",
+            "nvidia",
             "--no-anaconda-upload",
             "--python",
             self.python_version,

diff --git a/packaging/conda/xformers/meta.yaml b/packaging/conda/xformers/meta.yaml
@@ -8,21 +8,22 @@ source:
 requirements:
   build:
     - ninja
+    - pytorch=={{ environ.get('PYTORCH_VERSION') }}
   host:
     # - numpy >=1.11
     - python
     - pytorch=={{ environ.get('PYTORCH_VERSION') }}
-    - cudatoolkit{{ environ['CONDA_CUDATOOLKIT_CONSTRAINT'] }}
+    - {{environ['CONDA_CUDA_CONSTRAINT']}}
 
   run:
     # - numpy >=1.11
     - python
     - pytorch=={{ environ.get('PYTORCH_VERSION') }}
-    - cudatoolkit{{ environ['CONDA_CUDATOOLKIT_CONSTRAINT'] }}
+    - {{environ['CONDA_CUDA_CONSTRAINT']}}
 
 build:
   string: py{{py}}_cu{{ environ['CU_VERSION'] }}_pyt{{ environ['PYTORCH_VERSION']}}
-  script: python setup.py install --single-version-externally-managed --record=record.txt
+  script: {{environ['PYTHON']}} setup.py install --single-version-externally-managed --record=record.txt
   script_env:
     - BUILD_VERSION
     - CUDA_HOME

diff --git a/requirements-test.txt b/requirements-test.txt
@@ -14,7 +14,7 @@ click == 8.0.4
 protobuf==3.20.1
 
 # Tools for unit tests & coverage.
-pytest == 5.4.1
+pytest == 7.2.0
 pytest-cov == 2.10.0
 pytest-mpi == 0.4
 pytest-timeout == 1.4.2
@@ -27,4 +27,5 @@ hydra-core >= 1.1
 fairscale >= 0.4.5
 
 # Dependency for fused layers, optional
-triton == 2.0.0.dev20220701
+triton == 2.0.0.dev20221105
+networkx
diff --git a/tests/test_unbind.py b/tests/test_unbind.py
@@ -43,7 +43,8 @@ def test_unbind(dim: int, contiguous: bool):
     g = torch.randn_like(loss1)
     loss1.backward(g)
     loss2.backward(g)
-    # type: ignore
+    assert x.grad is not None
+    assert x2.grad is not None
     assert torch.allclose(x.grad, x2.grad)
 
 

diff --git a/xformers/benchmarks/LRA/run_tasks.py b/xformers/benchmarks/LRA/run_tasks.py
@@ -10,7 +10,7 @@
 import os
 from enum import Enum
 from pathlib import Path
-from typing import Dict, Tuple
+from typing import Dict, Tuple, cast
 
 import pytorch_lightning as pl
 import torch
@@ -51,10 +51,11 @@ def build_model(args: argparse.Namespace, config: Dict) -> nn.Module:
     task = args.task
     attention_name = args.attention
 
-    model: pl.LightningModule = (
+    model = cast(
+        pl.LightningModule,
         ModelForSCDual(config[f"{task}"], attention_name)
         if task == Task.Retrieval
-        else ModelForSC(config[f"{task}"], attention_name)
+        else ModelForSC(config[f"{task}"], attention_name),
     )
 
     logging.info(model)

diff --git a/xformers/components/nvfuser/bias_act_dropout.py b/xformers/components/nvfuser/bias_act_dropout.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 
+import functools
 from typing import Optional
 
 import torch
@@ -47,6 +48,16 @@ def __init__(
         self.bias = (
             nn.Parameter(torch.zeros(bias_shape)) if bias_shape is not None else None
         )
+        self._fn_train = functools.partial(
+            _fn,
+            activation=self.pytorch_activation,
+            prob=self.p,
+        )
+        self._fn_eval = functools.partial(
+            _fn,
+            activation=self.pytorch_activation,
+            prob=0.0,
+        )
 
         assert (
             self.p < 1.0
@@ -59,12 +70,12 @@ def init_weights(self, *args, **kwargs):
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         # Train/inference
-        p = self.p if self.training else 0.0
+        fn = self._fn_train if self.training else self._fn_eval
 
         # Catch a non-cuda setup, fallback to pytorch
         if not x.is_cuda:
-            return _fn(x, self.bias, self.pytorch_activation, p)
+            return fn(x, self.bias)
 
         # AOTAutograd, NVFuser backed path
-        aot_fn = memory_efficient_fusion(_fn, static_argnums=(2, 3))
-        return aot_fn(x, self.bias, self.pytorch_activation, p)
+        aot_fn = memory_efficient_fusion(fn)
+        return aot_fn(x, self.bias)
diff --git a/xformers/components/nvfuser/bias_dropout_res.py b/xformers/components/nvfuser/bias_dropout_res.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 
+import functools
 from typing import Optional
 
 import torch
@@ -14,8 +15,8 @@
 def _fn(
     x: torch.Tensor,
     bias: Optional[torch.nn.parameter.Parameter],
-    prob: float,
     residual: torch.Tensor,
+    prob: float,
 ) -> torch.Tensor:
     a = torch.add(x, bias) if bias is not None else x
     b = torch.nn.functional.dropout(a, prob) if prob > 0.0 else a
@@ -41,6 +42,8 @@ def __init__(
         self.bias = (
             nn.Parameter(torch.zeros(bias_shape)) if bias_shape is not None else None
         )
+        self._fn_train = functools.partial(_fn, prob=self.p)
+        self._fn_eval = functools.partial(_fn, prob=0.0)
 
         assert (
             self.p < 1.0
@@ -53,12 +56,12 @@ def init_weights(self, *args, **kwargs):
 
     def forward(self, x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
         # Train/inference
-        p = self.p if self.training else 0.0
+        fn = self._fn_train if self.training else self._fn_eval
 
         # Catch a non-cuda setup, fallback to pytorch
         if not x.is_cuda:
-            return _fn(x, self.bias, p, residual)
+            return fn(x, self.bias, residual)
 
         # AOTAutograd, NVFuser backed path
-        aot_fn = memory_efficient_fusion(fn=_fn, static_argnums=(2))
-        return aot_fn(x, self.bias, p, residual)
+        aot_fn = memory_efficient_fusion(fn)
+        return aot_fn(x, self.bias, residual)
diff --git a/xformers/components/nvfuser/bias_dropout_res_layernorm.py b/xformers/components/nvfuser/bias_dropout_res_layernorm.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 
+import functools
 from typing import Optional
 
 import torch
@@ -16,10 +17,10 @@
 def _fn(
     x: torch.Tensor,
     bias: Optional[torch.nn.parameter.Parameter],
+    residual: torch.Tensor,
     prob: float,
     layer_norm_style: Optional[ResidualNormStyle],
     norm: nn.Module,
-    residual: torch.Tensor,
 ) -> torch.Tensor:
     a = torch.add(x, bias) if bias is not None else x
     b = torch.nn.functional.dropout(a, prob) if prob > 0.0 else a
@@ -57,6 +58,18 @@ def __init__(
             nn.Parameter(torch.zeros(bias_shape)) if bias_shape is not None else None
         )
         self.norm = nn.LayerNorm(d_model)
+        self._fn_train = functools.partial(
+            _fn,
+            prob=p,
+            layer_norm_style=self.layer_norm_style,
+            norm=self.norm,
+        )
+        self._fn_eval = functools.partial(
+            _fn,
+            prob=0.0,
+            layer_norm_style=self.layer_norm_style,
+            norm=self.norm,
+        )
 
         assert (
             self.p < 1.0
@@ -69,12 +82,12 @@ def init_weights(self, *args, **kwargs):
 
     def forward(self, x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
         # Train/inference
-        p = self.p if self.training else 0.0
+        fn = self._fn_train if self.training else self._fn_eval
 
         # Catch a non-cuda setup, fallback to pytorch
         if not x.is_cuda:
-            return _fn(x, self.bias, p, self.layer_norm_style, self.norm, residual)
+            return fn(x, self.bias, residual)
 
         # AOTAutograd, NVFuser backed path
-        aot_fn = memory_efficient_fusion(fn=_fn, static_argnums=(2, 3, 4))
-        return aot_fn(x, self.bias, p, self.layer_norm_style, self.norm, residual)
+        aot_fn = memory_efficient_fusion(fn=fn)
+        return aot_fn(x, self.bias, residual)