Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Updates for PyTorch Lightning 2.0 release #266

Merged
merged 8 commits into from
Mar 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
python-version: [3.7, 3.8]
python-version: [3.8, 3.9]

steps:
- uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Visit https://zamba.drivendata.org/docs/ for full documentation and tutorials.

First, make sure you have the prerequisites installed:

* Python 3.7 or 3.8
* Python 3.8 or 3.9
* FFmpeg > 4.3

Then run:
Expand Down
2 changes: 1 addition & 1 deletion docs/docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ We encourage people to share their custom models trained with Zamba. If you trai

First, make sure you have the prerequisites installed:

* Python 3.7 or 3.8
* Python 3.8 or 3.9
* FFmpeg > 4.3

Then run:
Expand Down
4 changes: 2 additions & 2 deletions docs/docs/install.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ GPU configurations.

Prerequisites:

- Python 3.7 or 3.8
- Python 3.8 or 3.9
- FFmpeg

#### [Python](https://www.python.org/) 3.7 or 3.8
#### [Python](https://www.python.org/) 3.8 or 3.9

We recommend [Python installation using Anaconda](https://www.anaconda.com/download/) for all platforms. For more information about how to install Anaconda, here are some useful YouTube videos of installation:

Expand Down
2 changes: 1 addition & 1 deletion docs/docs/models/depth.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ The output of the depth estimation model is a csv with the following columns:
- `time`: seconds from the start of the video
- `distance`: distance between detected animal and the camera

There will be multiple rows per timestamp if there are multiple animals detected in the frame. If there is no animal in the frame, the distance will be null.
There will be multiple rows per timestamp if there are multiple animals detected in the frame. Due to current limitations of the algorithm, the distance for all animals in the frame will be the same. If there is no animal in the frame, the distance will be null.

For example, the first few rows of the `depth_predictions.csv` might look like this:

Expand Down
5 changes: 2 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ classifiers = [
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
requires-python = ">=3.7"
requires-python = ">=3.8"
dependencies = [
"appdirs",
"av",
Expand All @@ -25,7 +25,6 @@ dependencies = [
"future",
"fvcore",
"gitpython",
"importlib_metadata ; python_version < '3.8'",
"loguru",
"numpy",
"opencv-python-headless",
Expand All @@ -35,7 +34,7 @@ dependencies = [
"pqdm",
"pydantic",
"python-dotenv",
"pytorch-lightning>=1.6.0",
"pytorch-lightning>=2.0.0",
"pytorchvideo",
"scikit-learn",
"tensorboard",
Expand Down
27 changes: 20 additions & 7 deletions zamba/models/model_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.strategies import DDPStrategy
from pytorch_lightning.tuner import Tuner

from zamba.data.video import VideoLoaderConfig
from zamba.models.config import (
Expand All @@ -24,7 +25,11 @@
PredictConfig,
)
from zamba.models.registry import available_models
from zamba.models.utils import get_checkpoint_hparams, get_default_hparams
from zamba.models.utils import (
configure_accelerator_and_devices_from_gpus,
get_checkpoint_hparams,
get_default_hparams,
)
from zamba.pytorch.finetuning import BackboneFinetuning
from zamba.pytorch_lightning.utils import ZambaDataModule, ZambaVideoClassificationLightningModule

Expand Down Expand Up @@ -274,16 +279,18 @@ def train_model(
if train_config.backbone_finetune_config is not None:
callbacks.append(BackboneFinetuning(**train_config.backbone_finetune_config.dict()))

accelerator, devices = configure_accelerator_and_devices_from_gpus(train_config.gpus)

trainer = pl.Trainer(
gpus=train_config.gpus,
accelerator=accelerator,
devices=devices,
max_epochs=train_config.max_epochs,
auto_lr_find=train_config.auto_lr_find,
logger=tensorboard_logger,
callbacks=callbacks,
fast_dev_run=train_config.dry_run,
strategy=DDPStrategy(find_unused_parameters=False)
if data_module.multiprocessing_context is not None
else None,
if (data_module.multiprocessing_context is not None) and (train_config.gpus > 1)
else "auto",
)

if video_loader_config.cache_dir is None:
Expand All @@ -293,7 +300,8 @@ def train_model(

if train_config.auto_lr_find:
logger.info("Finding best learning rate.")
trainer.tune(model, data_module)
tuner = Tuner(trainer)
tuner.lr_find(model=model, datamodule=data_module)

try:
git_hash = git.Repo(search_parent_directories=True).head.object.hexsha
Expand Down Expand Up @@ -377,8 +385,13 @@ def predict_model(
else:
logger.info(f"Videos will be cached to {video_loader_config.cache_dir}.")

accelerator, devices = configure_accelerator_and_devices_from_gpus(predict_config.gpus)

trainer = pl.Trainer(
gpus=predict_config.gpus, logger=False, fast_dev_run=predict_config.dry_run
accelerator=accelerator,
devices=devices,
logger=False,
fast_dev_run=predict_config.dry_run,
)

configuration = {
Expand Down
11 changes: 11 additions & 0 deletions zamba/models/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,14 @@ def get_model_species(checkpoint, model_name):
else:
model_species = get_default_hparams(model_name)["species"]
return model_species


def configure_accelerator_and_devices_from_gpus(gpus):
    """Derive the ``accelerator`` and ``devices`` arguments for ``pl.Trainer``
    from a user-specified number of GPUs.

    Args:
        gpus (int, optional): Number of GPUs requested. ``None`` or ``0``
            selects CPU execution.

    Returns:
        tuple: ``(accelerator, devices)`` where ``accelerator`` is ``"gpu"``
        or ``"cpu"``, and ``devices`` is the requested GPU count for GPU runs
        or ``"auto"`` for CPU runs.
    """
    # Guard against gpus=None (a value the config layer may pass through);
    # the original `gpus > 0` comparison would raise TypeError on None.
    if gpus is not None and gpus > 0:
        accelerator = "gpu"
        devices = gpus
    else:
        accelerator = "cpu"
        devices = "auto"
    return accelerator, devices
30 changes: 22 additions & 8 deletions zamba/pytorch_lightning/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,10 @@ def __init__(
self.save_hyperparameters("lr", "scheduler", "scheduler_params", "species")
self.hparams["model_class"] = self.model_class

self.training_step_outputs = []
self.validation_step_outputs = []
self.test_step_outputs = []

def forward(self, x):
return self.model(x)

Expand All @@ -173,9 +177,10 @@ def training_step(self, batch, batch_idx):
y_hat = self(x)
loss = F.binary_cross_entropy_with_logits(y_hat, y)
self.log("train_loss", loss.detach())
self.training_step_outputs.append(loss)
return loss

def validation_step(self, batch, batch_idx):
def _val_step(self, batch, batch_idx):
x, y = batch
y_hat = self(x)
loss = F.binary_cross_entropy_with_logits(y_hat, y)
Expand All @@ -188,6 +193,16 @@ def validation_step(self, batch, batch_idx):
"y_proba": y_proba,
}

def validation_step(self, batch, batch_idx):
output = self._val_step(batch, batch_idx)
self.validation_step_outputs.append(output)
return output

def test_step(self, batch, batch_idx):
    """Run one test step (identical logic to validation) and buffer its output."""
    step_result = self._val_step(batch, batch_idx)
    self.test_step_outputs.append(step_result)
    return step_result

@staticmethod
def aggregate_step_outputs(
outputs: Dict[str, np.ndarray]
Expand Down Expand Up @@ -229,23 +244,22 @@ def compute_and_log_metrics(
):
self.log(f"species/{subset}_{metric_name}/{label}", metric)

def validation_epoch_end(self, outputs: List[Dict[str, np.ndarray]]):
def on_validation_epoch_end(self):
    """Aggregate the outputs buffered by ``validation_step`` to compute and
    log the validation macro F1 and top-K metrics, then clear the buffer.

    Note: under PyTorch Lightning 2.0 this hook takes no ``outputs`` argument;
    step outputs are accumulated in ``self.validation_step_outputs`` instead.
    """
    y_true, y_pred, y_proba = self.aggregate_step_outputs(self.validation_step_outputs)
    self.compute_and_log_metrics(y_true, y_pred, y_proba, subset="val")
    self.validation_step_outputs.clear()  # free memory before the next epoch

def test_step(self, batch, batch_idx):
return self.validation_step(batch, batch_idx)

def test_epoch_end(self, outputs: List[Dict[str, np.ndarray]]):
y_true, y_pred, y_proba = self.aggregate_step_outputs(outputs)
def on_test_epoch_end(self):
    """Aggregate buffered test-step outputs, log test metrics, and release the buffer."""
    labels, preds, probas = self.aggregate_step_outputs(self.test_step_outputs)
    self.compute_and_log_metrics(labels, preds, probas, subset="test")
    self.test_step_outputs.clear()  # free memory

def predict_step(self, batch, batch_idx, dataloader_idx: Optional[int] = None):
x, y = batch
Expand Down