Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add module for loading test data. #120

Merged
merged 5 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
"mounts": [
// Mount the local ~/.aws config to pass along AWS credentials for PBSS.
"source=${localEnv:HOME}/.aws,target=/home/bionemo/.aws,type=bind,consistency=cached",
"source=${localEnv:HOME}/.ngc,target=/home/bionemo/.ngc,type=bind,consistency=cached",
"source=${localEnv:HOME}/.cache,target=/home/bionemo/.cache,type=bind,consistency=cached",
pstjohn marked this conversation as resolved.
Show resolved Hide resolved
"source=${localEnv:HOME}/.ssh,target=/home/bionemo/.ssh,readonly,type=bind,consistency=cached"
],
"containerEnv": {
Expand Down
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@ docs/site/
.gnupg

# Lightning and project output files
/test_data
lightning_logs
/models
/data

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
9 changes: 9 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
{
"cSpell.words": [
"adata",
"allclose",
"bionemo",
"boto",
"botocore",
"docstrings",
"dtype",
"NBVAL",
"nemo",
"ngcsdk",
"pbss",
"platformdirs",
"pretraining",
"pydantic",
"rampup",
"resamplers",
"singlecell",
"tqdm",
"uniref"
],
"editor.rulers": [
Expand Down
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-LM
Submodule Megatron-LM updated 30 files
+1 −1 .flake8
+1 −2 .gitlab/stages/01.tests.yml
+2 −5 .pylintrc
+1 −2 Dockerfile.ci
+9 −36 megatron/core/models/gpt/gpt_layer_specs.py
+24 −41 megatron/core/models/gpt/gpt_model.py
+4 −14 megatron/core/parallel_state.py
+57 −97 megatron/core/tensor_parallel/layers.py
+6 −15 megatron/core/tensor_parallel/mappings.py
+21 −15 megatron/core/transformer/attention.py
+11 −14 megatron/core/transformer/custom_layers/transformer_engine.py
+4 −6 megatron/core/transformer/dot_product_attention.py
+1 −2 megatron/core/transformer/module.py
+11 −23 megatron/core/transformer/moe/token_dispatcher.py
+3 −0 megatron/core/transformer/spec_utils.py
+42 −25 megatron/core/transformer/transformer_block.py
+30 −48 megatron/core/transformer/transformer_config.py
+10 −9 megatron/core/transformer/transformer_layer.py
+9 −13 megatron/core/utils.py
+134 −155 megatron/legacy/model/language_model.py
+17 −34 megatron/legacy/model/transformer.py
+3 −4 megatron/training/training.py
+3 −3 tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py
+5 −2 tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py
+1 −8 tests/functional_tests/shell_test_utils/_run_training.sh
+0 −2 tests/functional_tests/shell_test_utils/run_ci_test.sh
+3 −0 tests/unit_tests/dist_checkpointing/conftest.py
+0 −1 tests/unit_tests/dist_checkpointing/test_optimizer.py
+2 −2 tests/unit_tests/test_utilities.py
+1 −9 tools/autoformat.sh
2 changes: 1 addition & 1 deletion 3rdparty/NeMo
Submodule NeMo updated 96 files
+41 −58 .github/workflows/cicd-main.yml
+1 −1 Dockerfile.ci
+1 −1 docs/source/multimodal/text2img/sd.rst
+1 −2 docs/source/nlp/nemo_megatron/intro.rst
+0 −62 docs/source/nlp/nemo_megatron/rampup_batch_size.rst
+0 −4 examples/audio/audio_to_audio_train.py
+0 −164 examples/audio/conf/flow_matching_generative.yaml
+0 −167 examples/audio/conf/flow_matching_generative_finetuning.yaml
+0 −171 examples/audio/conf/flow_matching_generative_ssl_pretraining.yaml
+0 −190 examples/llm/run/llama3_pretraining.py
+0 −83 examples/multimodal/speech_llm/export/README.md
+0 −16 examples/multimodal/speech_llm/export/conf/salm_export.yaml
+0 −39 examples/multimodal/speech_llm/export/export_salm.py
+0 −204 examples/multimodal/speech_llm/export/extract_salm_weights.py
+6 −17 examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py
+2 −4 examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
+0 −193 examples/tts/conf/audio_codec/audio_codec_22050.yaml
+0 −193 examples/tts/conf/audio_codec/audio_codec_44100.yaml
+6 −16 nemo/collections/audio/data/audio_to_audio_lhotse.py
+1 −32 nemo/collections/audio/models/audio_to_audio.py
+0 −269 nemo/collections/audio/models/enhancement.py
+0 −106 nemo/collections/audio/modules/ssl_pretrain_masking.py
+0 −252 nemo/collections/audio/parts/submodules/flow.py
+0 −507 nemo/collections/audio/parts/submodules/transformerunet.py
+0 −177 nemo/collections/audio/parts/utils/callbacks.py
+15 −40 nemo/collections/common/data/lhotse/nemo_adapters.py
+4 −12 nemo/collections/common/parts/preprocessing/cleaners.py
+0 −54 nemo/collections/common/parts/utils.py
+3 −4 nemo/collections/common/tokenizers/en_ja_tokenizers.py
+2 −2 nemo/collections/common/tokenizers/indic_tokenizers.py
+2 −2 nemo/collections/common/tokenizers/moses_tokenizers.py
+0 −43 nemo/collections/llm/__init__.py
+17 −11 nemo/collections/llm/api.py
+0 −6 nemo/collections/llm/fn/activation.py
+0 −52 nemo/collections/llm/gpt/model/__init__.py
+1 −11 nemo/collections/llm/gpt/model/base.py
+3 −88 nemo/collections/llm/gpt/model/llama.py
+0 −345 nemo/collections/llm/gpt/model/nemotron.py
+0 −392 nemo/collections/llm/gpt/model/qwen2.py
+0 −206 nemo/collections/llm/gpt/model/starcoder.py
+0 −383 nemo/collections/llm/gpt/model/starcoder2.py
+2 −7 nemo/collections/llm/recipes/log/default.py
+0 −7 nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py
+2 −2 nemo/collections/nlp/modules/common/megatron/utils.py
+33 −40 nemo/collections/nlp/modules/common/tokenizer_utils.py
+5 −308 nemo/collections/tts/modules/audio_codec_modules.py
+2 −27 nemo/core/optim/mcore_optim.py
+2 −10 nemo/deploy/multimodal/query_multimodal.py
+2 −118 nemo/export/multimodal/build.py
+6 −285 nemo/export/multimodal/run.py
+4 −10 nemo/export/tensorrt_llm.py
+15 −43 nemo/export/tensorrt_mm_exporter.py
+16 −37 nemo/export/trt_llm/converter/model_converter.py
+6 −35 nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
+117 −211 nemo/export/trt_llm/converter/utils.py
+16 −119 nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
+2 −0 nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
+4 −0 nemo/export/trt_llm/tensorrt_llm_build.py
+0 −9 nemo/export/trt_llm/tensorrt_llm_run.py
+1 −1 nemo/lightning/__init__.py
+49 −1 nemo/lightning/io/api.py
+1 −1 nemo/lightning/io/artifact/base.py
+7 −8 nemo/lightning/io/artifact/file.py
+4 −4 nemo/lightning/io/artifact/pickle.py
+1 −2 nemo/lightning/io/connector.py
+21 −91 nemo/lightning/io/mixin.py
+7 −16 nemo/lightning/io/pl.py
+1 −0 nemo/lightning/io/state.py
+18 −13 nemo/lightning/nemo_logger.py
+1 −6 nemo/lightning/pytorch/callbacks/__init__.py
+0 −74 nemo/lightning/pytorch/callbacks/ddp_parity_checker.py
+0 −68 nemo/lightning/pytorch/callbacks/garbage_collection.py
+0 −78 nemo/lightning/pytorch/callbacks/memory_profiler.py
+83 −24 nemo/lightning/pytorch/strategies.py
+0 −8 nemo/lightning/pytorch/strategies/__init__.py
+0 −245 nemo/lightning/pytorch/strategies/fsdp_strategy.py
+0 −308 nemo/lightning/pytorch/strategies/utils.py
+0 −18 nemo/lightning/resume.py
+0 −0 nemo/lightning/run/__init__.py
+0 −165 nemo/lightning/run/plugins.py
+0 −18 nemo/utils/exp_manager.py
+1 −1 requirements/requirements_lightning.txt
+3 −12 scripts/deploy/multimodal/deploy_triton.py
+2 −3 scripts/deploy/nlp/deploy_triton.py
+0 −139 scripts/export/export_mm_to_trtllm.py
+0 −36 scripts/export/export_to_trt_llm.py
+4 −9 scripts/nlp_language_modeling/niv2/preprocess_niv2.py
+2 −5 scripts/nlp_language_modeling/t0/t0_dataset_preproc.py
+0 −188 tests/collections/common/test_lhotse_nemo_adapters.py
+0 −611 tests/collections/llm/test_mnist_model_nemo2_fsdp.py
+0 −47 tests/core/test_exp_manager.py
+3 −36 tests/export/nemo_export.py
+0 −129 tests/lightning/test_ddp_parity_checker.py
+3 −33 tests/lightning/test_nemo_logger.py
+0 −253 tutorials/multimodal/SDXL Tutorial.ipynb
+0 −478 tutorials/tts/Audio_Codec_Inference.ipynb
5 changes: 5 additions & 0 deletions docs/mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ theme:
- content.code.annotate
- navigation.tabs
- navigation.sections
- navigation.indexes
- navigation.instant
- navigation.instant.prefetch
- search.suggest
Expand Down Expand Up @@ -54,10 +55,14 @@ plugins:
- mkdocs-jupyter

markdown_extensions:
- pymdownx.details
- pymdownx.superfences
- pymdownx.snippets:
check_paths: true
restrict_base_path: False
base_path: "."
- def_list
- admonition

copyright: |
© Copyright 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
13 changes: 11 additions & 2 deletions docs/scripts/gen_ref_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,21 @@
parts = tuple(module_path.parts)

if parts[-1] == "__init__":
parts = parts[:-1]
# parts = parts[:-1]
continue # Don't generate ref pages for __init__.py
elif parts[-1] == "__main__":
continue
continue # Don't generate ref pages for __main__.py

with mkdocs_gen_files.open(full_doc_path, "w") as fd:
identifier = ".".join(parts)
print("::: " + identifier, file=fd)

mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root))

for path in sorted(src.rglob("*.md")):
doc_path = path.relative_to(src)
full_doc_path = Path("API_reference", doc_path)
with mkdocs_gen_files.open(full_doc_path, "w") as fd:
fd.write(path.read_text())
print(full_doc_path)
mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root))
1 change: 1 addition & 0 deletions scripts/download_artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,4 +378,5 @@ def main(models: Optional[List[str]], data: Optional[List[str]]) -> None:
if not (args.models or args.data):
logging.warning("No models or data were selected to download.")
else:
logging.warning("This script is deprecated, use `bionemo.testing.data.load` instead.")
pstjohn marked this conversation as resolved.
Show resolved Hide resolved
main(models=args.models, data=args.data)
4 changes: 3 additions & 1 deletion scripts/singlecell/geneformer/test_pretrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption
from bionemo.llm.utils.datamodule_utils import parse_kwargs_to_arglist
from bionemo.testing import megatron_parallel_state_utils
from bionemo.testing.data.load import load
pstjohn marked this conversation as resolved.
Show resolved Hide resolved


# TODO(@jstjohn) use fixtures for pulling down data and checkpoints
Expand All @@ -39,7 +40,8 @@
# From here, we want to get to the root of the repository: _before_ sub-packages/
.parent.parent
).absolute()
data_path: Path = bionemo2_root / "test_data/cellxgene_2023-12-15_small/processed_data"
assert bionemo2_root != Path("/")
data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data"


def test_bionemo2_rootdir():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from bionemo.llm.model.biobert.model import MegatronBioBertModel
from bionemo.llm.utils.weight_utils import nemo1_to_nemo2_biobert_key_mapping
from bionemo.testing import megatron_parallel_state_utils
from bionemo.testing.data.load import load


bionemo2_root: Path = (
Expand All @@ -47,7 +48,7 @@
.parent.parent
).absolute()
assert bionemo2_root != Path("/")
nemo1_checkpoint_path: Path = bionemo2_root / "models/protein/esm2nv/esm2nv_650M_converted.nemo"
nemo1_checkpoint_path: Path = load("esm2/nv_650m:1.0")


def reduce_hiddens(hiddens: Tensor, attention_mask: Tensor) -> Tensor:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from bionemo.example_model import lightning_basic as lb
from bionemo.testing import megatron_parallel_state_utils
from bionemo.testing.data.load import BIONEMO_CACHE_DIR


@pytest.mark.needs_gpu
Expand All @@ -33,5 +34,5 @@ def test_train_mnist_litautoencoder_with_megatron_strategy_single_gpu():
enable_nemo_ckpt_io=False,
)
trainer = nl.Trainer(accelerator="gpu", devices=1, strategy=strategy, max_steps=10, num_nodes=1)
data_module = lb.MNISTDataModule()
data_module = lb.MNISTDataModule(str(BIONEMO_CACHE_DIR))
trainer.fit(model, data_module)
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from bionemo.llm.model.biobert.model import BiobertSpecOption
from bionemo.llm.utils.weight_utils import nemo1_to_nemo2_biobert_key_mapping
from bionemo.testing import megatron_parallel_state_utils
from bionemo.testing.data.load import load
from bionemo.testing.utils import assert_matrix_correlation_above_value, assert_matrix_mape_below_value


Expand All @@ -54,11 +55,11 @@
.parent.parent
).absolute()
assert bionemo2_root != Path("/")
nemo1_checkpoint_path: Path = bionemo2_root / "models/singlecell/geneformer/geneformer-qa.nemo"
nemo1_release_checkpoint_path: Path = bionemo2_root / "models/singlecell/geneformer/geneformer-10M-240530.nemo"
nemo_1_per_layer_outputs_path: Path = bionemo2_root / "test_data/nemo1-test-outputs-geneformer-qa.pt"
nemo_1_expected_values_path: Path = bionemo2_root / "test_data/nemo1_geneformer_qa_test_golden_values.pt"
data_path: Path = bionemo2_root / "test_data/cellxgene_2023-12-15_small/processed_data"
nemo1_checkpoint_path: Path = load("geneformer/qa")
nemo1_release_checkpoint_path: Path = load("geneformer/10M_240530")
nemo_1_per_layer_outputs_path: Path = load("single_cell/nemo1-geneformer-per-layer-outputs")
nemo_1_expected_values_path: Path = load("single_cell/nemo1-geneformer-golden-vals")
data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data"


CELLS_FOR_TEST: List[List[str]] = [
Expand Down
6 changes: 3 additions & 3 deletions sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

import pytest

from bionemo.testing.data.load import load


@pytest.fixture
def test_directory() -> Path:
Expand All @@ -26,6 +28,4 @@ def test_directory() -> Path:
Returns:
A Path object that is the directory with test data.
"""
current_file = Path(__file__).resolve()
target_path = current_file.parents[5].absolute() / "test_data/scdl_data"
return target_path
return load("scdl/sample") / "scdl_data"
pstjohn marked this conversation as resolved.
Show resolved Hide resolved
8 changes: 8 additions & 0 deletions sub-packages/bionemo-testing/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,9 @@
boto3
email-validator
platformdirs
pooch
pydantic>=2.7.0
pytest
pyyaml
ngcsdk
tqdm
75 changes: 75 additions & 0 deletions sub-packages/bionemo-testing/src/bionemo/testing/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# BioNeMo test data management

This library manages the downloading and caching of large or binary data files used in the documentation or test suite.
These files should not be committed directly to the repo, and instead should be loaded at test-time when they are
needed.

We currently support two locations for test data or saved models:

SwiftStack

: SwiftStack or `pbss` is an NVIDIA-internal, S3-compatible object store that allows for very large data and fast,
parallel reads and writes. Most critically, `pbss` can be uploaded to without legal approvals for dataset redistribution.
These files will not be accessible by external collaborators.

[NGC](https://catalog.ngc.nvidia.com/)

: NGC hosts containers, models, and resources, some of which require authentication and others that are generally
available. This library uses the model and resource types to save test data and reference model weights. These items
are accessible by external collaborators, but require legal approval before re-distributing test data.


## Loading test or example data

Test data are specified via yaml files in `sub-packages/bionemo-testing/src/bionemo/testing/data/resources`. As an
example, in `esm2.yaml`:

```yaml
- tag: nv_650m:1.0
ngc: "nvidia/clara/esm2nv650m:1.0"
ngc_registry: model
pbss: "s3://bionemo-ci/models/esm2nv_650M_converted.nemo"
sha256: 1e38063cafa808306329428dd17ea6df78c9e5d6b3d2caf04237c555a1f131b7
owner: Farhad Ramezanghorbani <[email protected]>
description: >
A pretrained 650M parameter ESM2 model.
See https://ngc.nvidia.com/catalog/models/nvidia:clara:esm2nv650m.
```

To load these model weights during a test, use the [load][bionemo.testing.data.load.load] function with the filename and
tag of the desired asset, which returns a path to the specified file:

```python
path_to_my_checkpoint = load("esm2/nv_650m:1.0")
config = ESM2Config(nemo1_ckpt_path=path_to_my_checkpoint)
```

If this function is called without the data available on the local machine, it will be fetched from the default source
(currently `pbss`). Otherwise, it will return the cached directory. To download with NGC, pass `source="ngc"` to
[load][bionemo.testing.data.load.load].

## File unpacking and/or decompression

All test artifacts are individual files. If a zip or tar archive is specified, it will be unpacked automatically, and
the path to the directory will be returned via [load][bionemo.testing.data.load.load]. Compressed files ('gzip', 'bz2',
or 'xz') are automatically decompressed before they are returned. The file's compression and/or archive format is
determined based on the filename specified in the `pbss` URL.

!!! note "Files in NGC resources"

NGC resources are folders, i.e., they may contain multiple files per resource.
[load][bionemo.testing.data.load.load] will _only_ download the file whose name matches the stem of the `pbss` URL. The
same NGC resource can therefore be used to host multiple test assets that are used independently.


## Adding new test assets

To add new data, first ensure that the data is available from either NGC or `pbss`. Next, extend or create a new yaml
file in `sub-packages/bionemo-testing/src/bionemo/testing/data/resources` with the required information. Owner emails
must be provided for all assets. The description and `ngc` fields are currently optional. If the `sha256` is left
unspecified, `pooch` will report the downloaded file's SHA-256 hash when loaded.

!!! warning

SHAs should be provided for all files to ensure the download completes correctly, and to invalidate caches if the
files change.
14 changes: 14 additions & 0 deletions sub-packages/bionemo-testing/src/bionemo/testing/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-Apache2
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Loading
Loading