NVIDIA · pstjohn · Sep 3, 2024 · Aug 23, 2024 · Aug 26, 2024 · Aug 27, 2024
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -14,6 +14,8 @@
 	"mounts": [
 		// Mount the local ~/.aws config to pass along AWS credentials for PBSS.
 		"source=${localEnv:HOME}/.aws,target=/home/bionemo/.aws,type=bind,consistency=cached",
+		"source=${localEnv:HOME}/.ngc,target=/home/bionemo/.ngc,type=bind,consistency=cached",
+		"source=${localEnv:HOME}/.cache,target=/home/bionemo/.cache,type=bind,consistency=cached",
 		"source=${localEnv:HOME}/.ssh,target=/home/bionemo/.ssh,readonly,type=bind,consistency=cached"
 	],
 	"containerEnv": {

diff --git a/.gitignore b/.gitignore
@@ -7,10 +7,7 @@ docs/site/
 .gnupg
 
 # Lightning and project output files
-/test_data
 lightning_logs
-/models
-/data
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,14 +1,23 @@
 {
     "cSpell.words": [
+        "adata",
         "allclose",
         "bionemo",
+        "boto",
+        "botocore",
+        "docstrings",
         "dtype",
         "NBVAL",
         "nemo",
+        "ngcsdk",
+        "pbss",
+        "platformdirs",
         "pretraining",
+        "pydantic",
         "rampup",
         "resamplers",
         "singlecell",
+        "tqdm",
         "uniref"
     ],
     "editor.rulers": [

diff --git a/3rdparty/Megatron-LM b/3rdparty/Megatron-LM
diff --git a/3rdparty/NeMo b/3rdparty/NeMo
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
@@ -9,6 +9,7 @@ theme:
     - content.code.annotate
     - navigation.tabs
     - navigation.sections
+    - navigation.indexes
     - navigation.instant
     - navigation.instant.prefetch
     - search.suggest
@@ -54,10 +55,14 @@ plugins:
   - mkdocs-jupyter
 
 markdown_extensions:
+  - pymdownx.details
+  - pymdownx.superfences
   - pymdownx.snippets:
       check_paths: true
       restrict_base_path: False
       base_path: "."
+  - def_list
+  - admonition
 
 copyright: |
   &copy; Copyright 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
diff --git a/docs/scripts/gen_ref_pages.py b/docs/scripts/gen_ref_pages.py
@@ -33,12 +33,21 @@
         parts = tuple(module_path.parts)
 
         if parts[-1] == "__init__":
-            parts = parts[:-1]
+            # parts = parts[:-1]
+            continue  # Don't generate ref pages for __init__.py
         elif parts[-1] == "__main__":
-            continue
+            continue  # Don't generate ref pages for __main__.py
 
         with mkdocs_gen_files.open(full_doc_path, "w") as fd:
             identifier = ".".join(parts)
             print("::: " + identifier, file=fd)
 
         mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root))
+
+    for path in sorted(src.rglob("*.md")):
+        doc_path = path.relative_to(src)
+        full_doc_path = Path("API_reference", doc_path)
+        with mkdocs_gen_files.open(full_doc_path, "w") as fd:
+            fd.write(path.read_text())
+        print(full_doc_path)
+        mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root))
diff --git a/scripts/download_artifacts.py b/scripts/download_artifacts.py
@@ -378,4 +378,5 @@ def main(models: Optional[List[str]], data: Optional[List[str]]) -> None:
     if not (args.models or args.data):
         logging.warning("No models or data were selected to download.")
     else:
+        logging.warning("This script is deprecated, use `bionemo.testing.data.load` instead.")
         main(models=args.models, data=args.data)
diff --git a/scripts/singlecell/geneformer/test_pretrain.py b/scripts/singlecell/geneformer/test_pretrain.py
@@ -27,6 +27,7 @@
 from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption
 from bionemo.llm.utils.datamodule_utils import parse_kwargs_to_arglist
 from bionemo.testing import megatron_parallel_state_utils
+from bionemo.testing.data.load import load
 
 
 # TODO(@jstjohn) use fixtures for pulling down data and checkpoints
@@ -39,7 +40,8 @@
     # From here, we want to get to the root of the repository: _before_ sub-packages/
     .parent.parent
 ).absolute()
-data_path: Path = bionemo2_root / "test_data/cellxgene_2023-12-15_small/processed_data"
+assert bionemo2_root != Path("/")
+data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data"
 
 
 def test_bionemo2_rootdir():

diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py
@@ -36,6 +36,7 @@
 from bionemo.llm.model.biobert.model import MegatronBioBertModel
 from bionemo.llm.utils.weight_utils import nemo1_to_nemo2_biobert_key_mapping
 from bionemo.testing import megatron_parallel_state_utils
+from bionemo.testing.data.load import load
 
 
 bionemo2_root: Path = (
@@ -47,7 +48,7 @@
     .parent.parent
 ).absolute()
 assert bionemo2_root != Path("/")
-nemo1_checkpoint_path: Path = bionemo2_root / "models/protein/esm2nv/esm2nv_650M_converted.nemo"
+nemo1_checkpoint_path: Path = load("esm2/nv_650m:1.0")
 
 
 def reduce_hiddens(hiddens: Tensor, attention_mask: Tensor) -> Tensor:

diff --git a/sub-packages/bionemo-example_model/tests/bionemo/example_model/test_lightning_basic.py b/sub-packages/bionemo-example_model/tests/bionemo/example_model/test_lightning_basic.py
@@ -19,6 +19,7 @@
 
 from bionemo.example_model import lightning_basic as lb
 from bionemo.testing import megatron_parallel_state_utils
+from bionemo.testing.data.load import BIONEMO_CACHE_DIR
 
 
 @pytest.mark.needs_gpu
@@ -33,5 +34,5 @@ def test_train_mnist_litautoencoder_with_megatron_strategy_single_gpu():
             enable_nemo_ckpt_io=False,
         )
         trainer = nl.Trainer(accelerator="gpu", devices=1, strategy=strategy, max_steps=10, num_nodes=1)
-        data_module = lb.MNISTDataModule()
+        data_module = lb.MNISTDataModule(str(BIONEMO_CACHE_DIR))
         trainer.fit(model, data_module)
diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py
@@ -40,6 +40,7 @@
 from bionemo.llm.model.biobert.model import BiobertSpecOption
 from bionemo.llm.utils.weight_utils import nemo1_to_nemo2_biobert_key_mapping
 from bionemo.testing import megatron_parallel_state_utils
+from bionemo.testing.data.load import load
 from bionemo.testing.utils import assert_matrix_correlation_above_value, assert_matrix_mape_below_value
 
 
@@ -54,11 +55,11 @@
     .parent.parent
 ).absolute()
 assert bionemo2_root != Path("/")
-nemo1_checkpoint_path: Path = bionemo2_root / "models/singlecell/geneformer/geneformer-qa.nemo"
-nemo1_release_checkpoint_path: Path = bionemo2_root / "models/singlecell/geneformer/geneformer-10M-240530.nemo"
-nemo_1_per_layer_outputs_path: Path = bionemo2_root / "test_data/nemo1-test-outputs-geneformer-qa.pt"
-nemo_1_expected_values_path: Path = bionemo2_root / "test_data/nemo1_geneformer_qa_test_golden_values.pt"
-data_path: Path = bionemo2_root / "test_data/cellxgene_2023-12-15_small/processed_data"
+nemo1_checkpoint_path: Path = load("geneformer/qa")
+nemo1_release_checkpoint_path: Path = load("geneformer/10M_240530")
+nemo_1_per_layer_outputs_path: Path = load("single_cell/nemo1-geneformer-per-layer-outputs")
+nemo_1_expected_values_path: Path = load("single_cell/nemo1-geneformer-golden-vals")
+data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data"
 
 
 CELLS_FOR_TEST: List[List[str]] = [

diff --git a/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py b/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py
@@ -18,6 +18,8 @@
 
 import pytest
 
+from bionemo.testing.data.load import load
+
 
 @pytest.fixture
 def test_directory() -> Path:
@@ -26,6 +28,4 @@ def test_directory() -> Path:
     Returns:
         A Path object that is the directory with test data.
     """
-    current_file = Path(__file__).resolve()
-    target_path = current_file.parents[5].absolute() / "test_data/scdl_data"
-    return target_path
+    return load("scdl/sample") / "scdl_data"
diff --git a/sub-packages/bionemo-testing/requirements.txt b/sub-packages/bionemo-testing/requirements.txt
@@ -1 +1,9 @@
+boto3
+email-validator
+platformdirs
+pooch
+pydantic>=2.7.0
 pytest
+pyyaml
+ngcsdk
+tqdm
diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/README.md b/sub-packages/bionemo-testing/src/bionemo/testing/data/README.md
@@ -0,0 +1,75 @@
+# BioNeMo test data management
+
+This library manages the downloading and caching of large or binary data files used in the documentation or test suite.
+These files should not be committed directly to the repo, and instead should be loaded at test-time when they are
+needed.
+
+We currently support two locations for test data or saved models:
+
+SwiftStack
+
+:   SwiftStack (`pbss`) is an NVIDIA-internal, S3-compatible object store that supports very large data and fast,
+    parallel reads and writes. Most critically, data can be uploaded to `pbss` without legal approvals for dataset redistribution.
+    These files will not be accessible by external collaborators.
+
+[NGC](https://catalog.ngc.nvidia.com/)
+
+:   NGC hosts containers, models, and resources, some of which require authentication and others that are generally
+    available. This library uses the model and resource types to save test data and reference model weights. These items
+    are accessible by external collaborators, but require legal approval before re-distributing test data.
+
+
+## Loading test or example data
+
+Test data are specified via yaml files in `sub-packages/bionemo-testing/src/bionemo/testing/data/resources`. As an
+example, in `esm2.yaml`:
+
+```yaml
+- tag: nv_650m:1.0
+  ngc: "nvidia/clara/esm2nv650m:1.0"
+  ngc_registry: model
+  pbss: "s3://bionemo-ci/models/esm2nv_650M_converted.nemo"
+  sha256: 1e38063cafa808306329428dd17ea6df78c9e5d6b3d2caf04237c555a1f131b7
+  owner: Farhad Ramezanghorbani <[email protected]>
+  description: >
+    A pretrained 650M parameter ESM2 model.
+    See https://ngc.nvidia.com/catalog/models/nvidia:clara:esm2nv650m.
+```
+
+To load these model weights during a test, use the [load][bionemo.testing.data.load.load] function with the filename and
+tag of the desired asset, which returns a path to the specified file:
+
+```python
+path_to_my_checkpoint = load("esm2/nv_650m:1.0")
+config = ESM2Config(nemo1_ckpt_path=path_to_my_checkpoint)
+```
+
+If this function is called without the data available on the local machine, it will be fetched from the default source
+(currently `pbss`). Otherwise, it will return the path to the cached copy. To download with NGC, pass `source="ngc"` to
+[load][bionemo.testing.data.load.load].
+
+## File unpacking and/or decompression
+
+All test artifacts are individual files. If a zip or tar archive is specified, it will be unpacked automatically, and
+the path to the directory will be returned via [load][bionemo.testing.data.load.load]. Compressed files ('gzip', 'bz2',
+or 'xz') are automatically decompressed before they are returned. The file's compression and/or archive format is
+determined based on the filename specified in the `pbss` URL.
+
+!!! note "Files in NGC resources"
+
+    NGC resources are folders, i.e., they may contain multiple files per resource.
+    [load][bionemo.testing.data.load.load] will _only_ download the file whose name matches the stem of the `pbss` URL. The
+    same NGC resource can therefore be used to host multiple test assets that are used independently.
+
+
+## Adding new test assets
+
+To add new data, first ensure that the data is available from either NGC or `pbss`. Next, extend or create a new yaml
+file in `sub-packages/bionemo-testing/src/bionemo/testing/data/resources` with the required information. Owner emails
+must be provided for all assets. The description and `ngc` fields are currently optional. If the `sha256` is left
+unspecified, `pooch` will report the downloaded file's sha when loaded.
+
+!!! warning
+
+    SHAs should be provided for all files to ensure the download completes correctly, and to invalidate caches if the
+    files change.
diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/__init__.py b/sub-packages/bionemo-testing/src/bionemo/testing/data/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+1 −1		.flake8
+1 −2		.gitlab/stages/01.tests.yml
+2 −5		.pylintrc
+1 −2		Dockerfile.ci
+9 −36		megatron/core/models/gpt/gpt_layer_specs.py
+24 −41		megatron/core/models/gpt/gpt_model.py
+4 −14		megatron/core/parallel_state.py
+57 −97		megatron/core/tensor_parallel/layers.py
+6 −15		megatron/core/tensor_parallel/mappings.py
+21 −15		megatron/core/transformer/attention.py
+11 −14		megatron/core/transformer/custom_layers/transformer_engine.py
+4 −6		megatron/core/transformer/dot_product_attention.py
+1 −2		megatron/core/transformer/module.py
+11 −23		megatron/core/transformer/moe/token_dispatcher.py
+3 −0		megatron/core/transformer/spec_utils.py
+42 −25		megatron/core/transformer/transformer_block.py
+30 −48		megatron/core/transformer/transformer_config.py
+10 −9		megatron/core/transformer/transformer_layer.py
+9 −13		megatron/core/utils.py
+134 −155		megatron/legacy/model/language_model.py
+17 −34		megatron/legacy/model/transformer.py
+3 −4		megatron/training/training.py
+3 −3		tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py
+5 −2		tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py
+1 −8		tests/functional_tests/shell_test_utils/_run_training.sh
+0 −2		tests/functional_tests/shell_test_utils/run_ci_test.sh
+3 −0		tests/unit_tests/dist_checkpointing/conftest.py
+0 −1		tests/unit_tests/dist_checkpointing/test_optimizer.py
+2 −2		tests/unit_tests/test_utilities.py
+1 −9		tools/autoformat.sh
+41 −58		.github/workflows/cicd-main.yml
+1 −1		Dockerfile.ci
+1 −1		docs/source/multimodal/text2img/sd.rst
+1 −2		docs/source/nlp/nemo_megatron/intro.rst
+0 −62		docs/source/nlp/nemo_megatron/rampup_batch_size.rst
+0 −4		examples/audio/audio_to_audio_train.py
+0 −164		examples/audio/conf/flow_matching_generative.yaml
+0 −167		examples/audio/conf/flow_matching_generative_finetuning.yaml
+0 −171		examples/audio/conf/flow_matching_generative_ssl_pretraining.yaml
+0 −190		examples/llm/run/llama3_pretraining.py
+0 −83		examples/multimodal/speech_llm/export/README.md
+0 −16		examples/multimodal/speech_llm/export/conf/salm_export.yaml
+0 −39		examples/multimodal/speech_llm/export/export_salm.py
+0 −204		examples/multimodal/speech_llm/export/extract_salm_weights.py
+6 −17		examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py
+2 −4		examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
+0 −193		examples/tts/conf/audio_codec/audio_codec_22050.yaml
+0 −193		examples/tts/conf/audio_codec/audio_codec_44100.yaml
+6 −16		nemo/collections/audio/data/audio_to_audio_lhotse.py
+1 −32		nemo/collections/audio/models/audio_to_audio.py
+0 −269		nemo/collections/audio/models/enhancement.py
+0 −106		nemo/collections/audio/modules/ssl_pretrain_masking.py
+0 −252		nemo/collections/audio/parts/submodules/flow.py
+0 −507		nemo/collections/audio/parts/submodules/transformerunet.py
+0 −177		nemo/collections/audio/parts/utils/callbacks.py
+15 −40		nemo/collections/common/data/lhotse/nemo_adapters.py
+4 −12		nemo/collections/common/parts/preprocessing/cleaners.py
+0 −54		nemo/collections/common/parts/utils.py
+3 −4		nemo/collections/common/tokenizers/en_ja_tokenizers.py
+2 −2		nemo/collections/common/tokenizers/indic_tokenizers.py
+2 −2		nemo/collections/common/tokenizers/moses_tokenizers.py
+0 −43		nemo/collections/llm/__init__.py
+17 −11		nemo/collections/llm/api.py
+0 −6		nemo/collections/llm/fn/activation.py
+0 −52		nemo/collections/llm/gpt/model/__init__.py
+1 −11		nemo/collections/llm/gpt/model/base.py
+3 −88		nemo/collections/llm/gpt/model/llama.py
+0 −345		nemo/collections/llm/gpt/model/nemotron.py
+0 −392		nemo/collections/llm/gpt/model/qwen2.py
+0 −206		nemo/collections/llm/gpt/model/starcoder.py
+0 −383		nemo/collections/llm/gpt/model/starcoder2.py
+2 −7		nemo/collections/llm/recipes/log/default.py
+0 −7		nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py
+2 −2		nemo/collections/nlp/modules/common/megatron/utils.py
+33 −40		nemo/collections/nlp/modules/common/tokenizer_utils.py
+5 −308		nemo/collections/tts/modules/audio_codec_modules.py
+2 −27		nemo/core/optim/mcore_optim.py
+2 −10		nemo/deploy/multimodal/query_multimodal.py
+2 −118		nemo/export/multimodal/build.py
+6 −285		nemo/export/multimodal/run.py
+4 −10		nemo/export/tensorrt_llm.py
+15 −43		nemo/export/tensorrt_mm_exporter.py
+16 −37		nemo/export/trt_llm/converter/model_converter.py
+6 −35		nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
+117 −211		nemo/export/trt_llm/converter/utils.py
+16 −119		nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
+2 −0		nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
+4 −0		nemo/export/trt_llm/tensorrt_llm_build.py
+0 −9		nemo/export/trt_llm/tensorrt_llm_run.py
+1 −1		nemo/lightning/__init__.py
+49 −1		nemo/lightning/io/api.py
+1 −1		nemo/lightning/io/artifact/base.py
+7 −8		nemo/lightning/io/artifact/file.py
+4 −4		nemo/lightning/io/artifact/pickle.py
+1 −2		nemo/lightning/io/connector.py
+21 −91		nemo/lightning/io/mixin.py
+7 −16		nemo/lightning/io/pl.py
+1 −0		nemo/lightning/io/state.py
+18 −13		nemo/lightning/nemo_logger.py
+1 −6		nemo/lightning/pytorch/callbacks/__init__.py
+0 −74		nemo/lightning/pytorch/callbacks/ddp_parity_checker.py
+0 −68		nemo/lightning/pytorch/callbacks/garbage_collection.py
+0 −78		nemo/lightning/pytorch/callbacks/memory_profiler.py
+83 −24		nemo/lightning/pytorch/strategies.py
+0 −8		nemo/lightning/pytorch/strategies/__init__.py
+0 −245		nemo/lightning/pytorch/strategies/fsdp_strategy.py
+0 −308		nemo/lightning/pytorch/strategies/utils.py
+0 −18		nemo/lightning/resume.py
+0 −0		nemo/lightning/run/__init__.py
+0 −165		nemo/lightning/run/plugins.py
+0 −18		nemo/utils/exp_manager.py
+1 −1		requirements/requirements_lightning.txt
+3 −12		scripts/deploy/multimodal/deploy_triton.py
+2 −3		scripts/deploy/nlp/deploy_triton.py
+0 −139		scripts/export/export_mm_to_trtllm.py
+0 −36		scripts/export/export_to_trt_llm.py
+4 −9		scripts/nlp_language_modeling/niv2/preprocess_niv2.py
+2 −5		scripts/nlp_language_modeling/t0/t0_dataset_preproc.py
+0 −188		tests/collections/common/test_lhotse_nemo_adapters.py
+0 −611		tests/collections/llm/test_mnist_model_nemo2_fsdp.py
+0 −47		tests/core/test_exp_manager.py
+3 −36		tests/export/nemo_export.py
+0 −129		tests/lightning/test_ddp_parity_checker.py
+3 −33		tests/lightning/test_nemo_logger.py
+0 −253		tutorials/multimodal/SDXL Tutorial.ipynb
+0 −478		tutorials/tts/Audio_Codec_Inference.ipynb