From 17ed94a019f38eb5a92144d246ea5493caea1df5 Mon Sep 17 00:00:00 2001 From: James Webber Date: Thu, 6 Apr 2023 17:50:16 -0400 Subject: [PATCH 1/2] factored models into json files --- MANIFEST.in | 1 + src/longbow/preconfigured_models/__init__.py | 0 .../preconfigured_models/array/isoseq.json | 14 + .../preconfigured_models/array/mas_10.json | 32 +++ .../preconfigured_models/array/mas_15.json | 42 +++ .../preconfigured_models/array/mas_16.json | 44 +++ .../preconfigured_models/cdna/bulk_10x5p.json | 57 ++++ .../cdna/bulk_teloprimeV2.json | 40 +++ .../preconfigured_models/cdna/sc_10x3p.json | 59 ++++ .../preconfigured_models/cdna/sc_10x5p.json | 61 +++++ .../cdna/spatial_slideseq.json | 68 +++++ src/longbow/utils/model_utils.py | 256 ++---------------- 12 files changed, 446 insertions(+), 228 deletions(-) create mode 100644 src/longbow/preconfigured_models/__init__.py create mode 100644 src/longbow/preconfigured_models/array/isoseq.json create mode 100644 src/longbow/preconfigured_models/array/mas_10.json create mode 100644 src/longbow/preconfigured_models/array/mas_15.json create mode 100644 src/longbow/preconfigured_models/array/mas_16.json create mode 100644 src/longbow/preconfigured_models/cdna/bulk_10x5p.json create mode 100644 src/longbow/preconfigured_models/cdna/bulk_teloprimeV2.json create mode 100644 src/longbow/preconfigured_models/cdna/sc_10x3p.json create mode 100644 src/longbow/preconfigured_models/cdna/sc_10x5p.json create mode 100644 src/longbow/preconfigured_models/cdna/spatial_slideseq.json diff --git a/MANIFEST.in b/MANIFEST.in index ba0de15f..2141ac20 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,6 +4,7 @@ graft src include README.rst include LICENSE.txt include tox.ini .travis.yml +include src/longbow/preconfigured_models/**/*.json prune **/.hypothesis diff --git a/src/longbow/preconfigured_models/__init__.py b/src/longbow/preconfigured_models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/longbow/preconfigured_models/array/isoseq.json b/src/longbow/preconfigured_models/array/isoseq.json new file mode 100644 index 00000000..a6321b32 --- /dev/null +++ b/src/longbow/preconfigured_models/array/isoseq.json @@ -0,0 +1,14 @@ +{ + "description": "PacBio IsoSeq model", + "version": "3.0.0", + "structure": [ + "V", + "M" + ], + "adapters": { + "V": "TCTACACGACGCTCTTCCGATCT", + "M": "GTACTCTGCGTTGATACCACTGCTT" + }, + "deprecated": false, + "name": "isoseq" +} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/array/mas_10.json b/src/longbow/preconfigured_models/array/mas_10.json new file mode 100644 index 00000000..456079fb --- /dev/null +++ b/src/longbow/preconfigured_models/array/mas_10.json @@ -0,0 +1,32 @@ +{ + "description": "10-element MAS-ISO-seq array", + "version": "3.0.0", + "structure": [ + "Q", + "C", + "M", + "I", + "O", + "J", + "B", + "D", + "K", + "H", + "R" + ], + "adapters": { + "Q": "AAGCACCATAATGTGT", + "C": "ACTCTGTCAGGTCCGA", + "M": "ACCTAGATCAGAGCCT", + "I": "AGTGCGTTGCGAATTG", + "O": "AAGTCACCGGCACCTT", + "J": "AATTGCGTAGTTGGCC", + "B": "ACTTGTAAGCTGTCTA", + "D": "ACCTCCTCCTCCAGAA", + "K": "ACACTTGGTCGCAATC", + "H": "ATGTTGAATCCTAGCG", + "R": "AACCGGACACACTTAG" + }, + "deprecated": false, + "name": "mas_10" +} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/array/mas_15.json b/src/longbow/preconfigured_models/array/mas_15.json new file mode 100644 index 00000000..b1aae4f8 --- /dev/null +++ b/src/longbow/preconfigured_models/array/mas_15.json @@ -0,0 +1,42 @@ +{ + "description": "15-element MAS-ISO-seq array", + "version": "3.0.0", + "structure": [ + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "J", + "K", + "L", + "M", + "N", + "O", + "P" + ], + "adapters": { + "A": "AGCTTACTTGTGAAGA", + "B": "ACTTGTAAGCTGTCTA", + "C": "ACTCTGTCAGGTCCGA", + "D": "ACCTCCTCCTCCAGAA", + "E": "AACCGGACACACTTAG", + "F": "AGAGTCCAATTCGCAG", + "G": "AATCAAGGCTTAACGG", + "H": "ATGTTGAATCCTAGCG", + "I": "AGTGCGTTGCGAATTG", + "J": "AATTGCGTAGTTGGCC", + "K": "ACACTTGGTCGCAATC", + "L": "AGTAAGCCTTCGTGTC", + "M": "ACCTAGATCAGAGCCT", + "N": "AGGTATGCCGGTTAAG", + "O": "AAGTCACCGGCACCTT", + "P": "ATGAAGTGGCTCGAGA" + }, + "deprecated": false, + "name": "mas_15" +} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/array/mas_16.json b/src/longbow/preconfigured_models/array/mas_16.json new file mode 100644 index 00000000..0b9ef39c --- /dev/null +++ b/src/longbow/preconfigured_models/array/mas_16.json @@ -0,0 +1,44 @@ +{ + "description": "16-element MAS-ISO-seq array", + "version": "3.0.0", + "structure": [ + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "J", + "K", + "L", + "M", + "N", + "O", + "P", + "Q" + ], + "adapters": { + "A": "AGCTTACTTGTGAAGA", + "B": "ACTTGTAAGCTGTCTA", + "C": "ACTCTGTCAGGTCCGA", + "D": "ACCTCCTCCTCCAGAA", + "E": "AACCGGACACACTTAG", + "F": "AGAGTCCAATTCGCAG", + "G": "AATCAAGGCTTAACGG", + "H": "ATGTTGAATCCTAGCG", + "I": "AGTGCGTTGCGAATTG", + "J": "AATTGCGTAGTTGGCC", + "K": "ACACTTGGTCGCAATC", + "L": "AGTAAGCCTTCGTGTC", + "M": "ACCTAGATCAGAGCCT", + "N": "AGGTATGCCGGTTAAG", + "O": "AAGTCACCGGCACCTT", + "P": "ATGAAGTGGCTCGAGA", + "Q": "AGTAGCTGTGTGCA" + }, + "deprecated": false, + "name": "mas_16" +} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/cdna/bulk_10x5p.json b/src/longbow/preconfigured_models/cdna/bulk_10x5p.json new file mode 100644 index 00000000..16b50d05 --- /dev/null +++ b/src/longbow/preconfigured_models/cdna/bulk_10x5p.json @@ -0,0 +1,57 @@ +{ + "description": "bulk 10x 5' kit", + "version": "3.0.0", + "structure": [ + "5p_Adapter", + "UMI", + "SLS", + "cDNA", + "Poly_A", + "sample_index", + "3p_Adapter" + ], + "adapters": { + "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", + "UMI": { + "FixedLengthRandomBases": 10 + }, + "SLS": "TTTCTTATATGGG", + "cDNA": "random", + "Poly_A": { + "HomopolymerRepeat": [ + "A", + 30 + ] + }, + "sample_index": { + "FixedLengthRandomBases": 10 + }, + "3p_Adapter": "CTCTGCGTTGATACCACTGCTT" + }, + "named_random_segments": [ + "UMI", + "cDNA", + "sample_index" + ], + "coding_region": "cDNA", + "annotation_segments": { + "UMI": [ + [ + "ZU", + "XU" + ], + [ + "XM", + "XU" + ] + ], + "sample_index": [ + [ + "id", + "ip" + ] + ] + }, + "deprecated": false, + "name": "bulk_10x5p" +} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/cdna/bulk_teloprimeV2.json b/src/longbow/preconfigured_models/cdna/bulk_teloprimeV2.json new file mode 100644 index 00000000..f3d02b47 --- /dev/null +++ b/src/longbow/preconfigured_models/cdna/bulk_teloprimeV2.json @@ -0,0 +1,40 @@ +{ + "description": "Lexogen TeloPrime V2 kit", + "version": "3.0.0", + "structure": [ + "TPV2_adapter", + "cDNA", + "Poly_A", + "idx", + "rev_bind" + ], + "adapters": { + "TPV2_adapter": "CTACACGACGCTCTTCCGATCTTGGATTGATATGTAATACGACTCACTATAG", + "cDNA": "random", + "Poly_A": { + "HomopolymerRepeat": [ + "A", + 30 + ] + }, + "idx": { + "FixedLengthRandomBases": 10 + }, + "rev_bind": "CTCTGCGTTGATACCACTGCTT" + }, + "named_random_segments": [ + "idx", + "cDNA" + ], + "coding_region": "cDNA", + "annotation_segments": { + "idx": [ + [ + "BC", + "XB" + ] + ] + }, + "deprecated": false, + "name": "bulk_teloprimeV2" +} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/cdna/sc_10x3p.json b/src/longbow/preconfigured_models/cdna/sc_10x3p.json new file mode 100644 index 00000000..04f5105a --- /dev/null +++ b/src/longbow/preconfigured_models/cdna/sc_10x3p.json @@ -0,0 +1,59 @@ +{ + "description": "single-cell 10x 3' kit", + "version": "3.0.0", + "structure": [ + "5p_Adapter", + "CBC", + "UMI", + "Poly_T", + "cDNA", + "3p_Adapter" + ], + "adapters": { + "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", + "CBC": { + "FixedLengthRandomBases": 16 + }, + "UMI": { + "FixedLengthRandomBases": 12 + }, + "Poly_T": { + "HomopolymerRepeat": [ + "T", + 30 + ] + }, + "cDNA": "random", + "3p_Adapter": "CCCATGTACTCTGCGTTGATACCACTGCTT" + }, + "named_random_segments": [ + "CBC", + "UMI", + "cDNA" + ], + "coding_region": "cDNA", + "annotation_segments": { + "UMI": [ + [ + "ZU", + "XU" + ], + [ + "XM", + "XU" + ] + ], + "CBC": [ + [ + "CR", + "XB" + ], + [ + "XC", + "XB" + ] + ] + }, + "deprecated": false, + "name": "sc_10x3p" +} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/cdna/sc_10x5p.json b/src/longbow/preconfigured_models/cdna/sc_10x5p.json new file mode 100644 index 00000000..f21576a2 --- /dev/null +++ b/src/longbow/preconfigured_models/cdna/sc_10x5p.json @@ -0,0 +1,61 @@ +{ + "description": "single-cell 10x 5' kit", + "version": "3.0.0", + "structure": [ + "5p_Adapter", + "CBC", + "UMI", + "SLS", + "cDNA", + "Poly_A", + "3p_Adapter" + ], + "adapters": { + "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", + "CBC": { + "FixedLengthRandomBases": 16 + }, + "UMI": { + "FixedLengthRandomBases": 10 + }, + "SLS": "TTTCTTATATGGG", + "cDNA": "random", + "Poly_A": { + "HomopolymerRepeat": [ + "A", + 30 + ] + }, + "3p_Adapter": "GTACTCTGCGTTGATACCACTGCTT" + }, + "named_random_segments": [ + "CBC", + "UMI", + "cDNA" + ], + "coding_region": "cDNA", + "annotation_segments": { + "UMI": [ + [ + "ZU", + "XU" + ], + [ + "XM", + "XU" + ] + ], + "CBC": [ + [ + "CR", + "XB" + ], + [ + "XC", + "XB" + ] + ] + }, + "deprecated": false, + "name": "sc_10x5p" +} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/cdna/spatial_slideseq.json b/src/longbow/preconfigured_models/cdna/spatial_slideseq.json new file mode 100644 index 00000000..3ace8e65 --- /dev/null +++ b/src/longbow/preconfigured_models/cdna/spatial_slideseq.json @@ -0,0 +1,68 @@ +{ + "description": "Slide-seq protocol", + "version": "3.0.0", + "structure": [ + "5p_Adapter", + "SBC2", + "SLS2", + "SBC1", + "UMI", + "Poly_T", + "cDNA", + "3p_Adapter" + ], + "adapters": { + "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", + "SBC2": { + "FixedLengthRandomBases": 8 + }, + "SLS2": "TCTTCAGCGTTCCCGAGA", + "SBC1": { + "FixedLengthRandomBases": 6 + }, + "UMI": { + "FixedLengthRandomBases": 9 + }, + "Poly_T": { + "HomopolymerRepeat": [ + "T", + 30 + ] + }, + "cDNA": "random", + "3p_Adapter": "CCCATGTACTCTGCGTTGATACCACTGCTT" + }, + "named_random_segments": [ + "UMI", + "SBC2", + "SBC1", + "cDNA" + ], + "coding_region": "cDNA", + "annotation_segments": { + "UMI": [ + [ + "ZU", + "XU" + ], + [ + "XM", + "XU" + ] + ], + "SBC1": [ + [ + "X1", + "XP" + ] + ], + "SBC2": [ + [ + "X2", + "XR" + ] + ] + }, + "deprecated": false, + "name": "spatial_slideseq" +} \ No newline at end of file diff --git a/src/longbow/utils/model_utils.py b/src/longbow/utils/model_utils.py index b1eff308..c67b68f4 100644 --- a/src/longbow/utils/model_utils.py +++ b/src/longbow/utils/model_utils.py @@ -1,14 +1,20 @@ import sys import re +import importlib.resources +import json import logging import click_log from pomegranate import * -import longbow.utils.constants -from .constants import RANDOM_SEGMENT_NAME, FIXED_LENGTH_RANDOM_SEGMENT_TYPE_NAME, HPR_SEGMENT_TYPE_NAME, \ - RANDOM_SILENT_STATE_A, RANDOM_SILENT_STATE_B, RANDOM_BASE_STATE, BAKE_MERGE_STRATEGY +from .constants import ( + RANDOM_SEGMENT_NAME, + RANDOM_SILENT_STATE_A, + RANDOM_SILENT_STATE_B, + RANDOM_BASE_STATE, + BAKE_MERGE_STRATEGY, +) logging.basicConfig(stream=sys.stderr) logger = logging.getLogger(__name__) @@ -17,6 +23,23 @@ starts_with_number_re = re.compile(r"^\d") +def load_preconfigured_models(): + pre_configured_models = {"array": {}, "cdna": {}} + + with importlib.resources.path("longbow", "preconfigured_models") as model_dir: + for json_file in (model_dir / "array").glob("*json"): + with json_file.open() as fh: + m = json.load(fh) + pre_configured_models["array"][m["name"]] = m + + for json_file in (model_dir / "cdna").glob("*json"): + with json_file.open() as fh: + m = json.load(fh) + pre_configured_models["cdna"][m["name"]] = m + + return pre_configured_models + + class ModelBuilder: """Utilities for constructing a full Longbow model.""" @@ -56,6 +79,8 @@ class ModelBuilder: SUDDEN_END_PROB = 0.01 MATCH_END_PROB = 0.1 + pre_configured_models = load_preconfigured_models() + @staticmethod def make_global_alignment_model(target, name=None): logger.debug("Making Model: GLOBAL_ALIGNMENT (%s)", name) @@ -349,228 +374,3 @@ def connect_terminals(base_hmm, adapter_name_i, adapter_name_j, transition_proba # base_hmm.bake(merge=BAKE_MERGE_STRATEGY) return base_hmm - - pre_configured_models = { - 'array': { - "mas_16": { - "description": "16-element MAS-ISO-seq array", - "version": "3.0.0", - "structure": ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"], - "adapters": { - "A": "AGCTTACTTGTGAAGA", - "B": "ACTTGTAAGCTGTCTA", - "C": "ACTCTGTCAGGTCCGA", - "D": "ACCTCCTCCTCCAGAA", - "E": "AACCGGACACACTTAG", - "F": "AGAGTCCAATTCGCAG", - "G": "AATCAAGGCTTAACGG", - "H": "ATGTTGAATCCTAGCG", - "I": "AGTGCGTTGCGAATTG", - "J": "AATTGCGTAGTTGGCC", - "K": "ACACTTGGTCGCAATC", - "L": "AGTAAGCCTTCGTGTC", - "M": "ACCTAGATCAGAGCCT", - "N": "AGGTATGCCGGTTAAG", - "O": "AAGTCACCGGCACCTT", - "P": "ATGAAGTGGCTCGAGA", - "Q": "AGTAGCTGTGTGCA", - }, - "deprecated": False, - "name": "mas_16", - }, - - "mas_15": { - "description": "15-element MAS-ISO-seq array", - "version": "3.0.0", - "structure": ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P"], - "adapters": { - "A": "AGCTTACTTGTGAAGA", - "B": "ACTTGTAAGCTGTCTA", - "C": "ACTCTGTCAGGTCCGA", - "D": "ACCTCCTCCTCCAGAA", - "E": "AACCGGACACACTTAG", - "F": "AGAGTCCAATTCGCAG", - "G": "AATCAAGGCTTAACGG", - "H": "ATGTTGAATCCTAGCG", - "I": "AGTGCGTTGCGAATTG", - "J": "AATTGCGTAGTTGGCC", - "K": "ACACTTGGTCGCAATC", - "L": "AGTAAGCCTTCGTGTC", - "M": "ACCTAGATCAGAGCCT", - "N": "AGGTATGCCGGTTAAG", - "O": "AAGTCACCGGCACCTT", - "P": "ATGAAGTGGCTCGAGA", - }, - "deprecated": False, - "name": "mas_15", - }, - - "mas_10": { - "description": "10-element MAS-ISO-seq array", - "version": "3.0.0", - "structure": ["Q", "C", "M", "I", "O", "J", "B", "D", "K", "H", "R"], - "adapters": { - "Q": "AAGCACCATAATGTGT", - "C": "ACTCTGTCAGGTCCGA", - "M": "ACCTAGATCAGAGCCT", - "I": "AGTGCGTTGCGAATTG", - "O": "AAGTCACCGGCACCTT", - "J": "AATTGCGTAGTTGGCC", - "B": "ACTTGTAAGCTGTCTA", - "D": "ACCTCCTCCTCCAGAA", - "K": "ACACTTGGTCGCAATC", - "H": "ATGTTGAATCCTAGCG", - "R": "AACCGGACACACTTAG", - }, - "deprecated": False, - "name": "mas_10", - }, - - "isoseq": { - "description": "PacBio IsoSeq model", - "version": "3.0.0", - "structure": ["V", "M"], - "adapters": { - "V": "TCTACACGACGCTCTTCCGATCT", - "M": "GTACTCTGCGTTGATACCACTGCTT", - }, - "deprecated": False, - "name": "isoseq", - }, - }, - - 'cdna': { - "sc_10x3p": { - "description": "single-cell 10x 3' kit", - "version": "3.0.0", - "structure": ["5p_Adapter", "CBC", "UMI", "Poly_T", "cDNA", "3p_Adapter"], - "adapters": { - "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", - "CBC": {FIXED_LENGTH_RANDOM_SEGMENT_TYPE_NAME: 16}, - "UMI": {FIXED_LENGTH_RANDOM_SEGMENT_TYPE_NAME: 12}, - "Poly_T": {HPR_SEGMENT_TYPE_NAME: ("T", 30)}, - "cDNA": RANDOM_SEGMENT_NAME, - "3p_Adapter": "CCCATGTACTCTGCGTTGATACCACTGCTT", - }, - "named_random_segments": ["CBC", "UMI", "cDNA"], - "coding_region": "cDNA", - "annotation_segments": { - "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG), - (longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)], - "CBC": [(longbow.utils.constants.READ_BARCODE_TAG, - longbow.utils.constants.READ_BARCODE_POS_TAG), - (longbow.utils.constants.READ_RAW_BARCODE_TAG, - longbow.utils.constants.READ_BARCODE_POS_TAG)], - }, - "deprecated": False, - "name": "sc_10x3p", - }, - - "sc_10x5p": { - "description": "single-cell 10x 5' kit", - "version": "3.0.0", - "structure": ["5p_Adapter", "CBC", "UMI", "SLS", "cDNA", "Poly_A", "3p_Adapter"], - "adapters": { - "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", - "CBC": {FIXED_LENGTH_RANDOM_SEGMENT_TYPE_NAME: 16}, - "UMI": {FIXED_LENGTH_RANDOM_SEGMENT_TYPE_NAME: 10}, - "SLS": "TTTCTTATATGGG", # Switch Leader Seq - "cDNA": RANDOM_SEGMENT_NAME, - "Poly_A": {HPR_SEGMENT_TYPE_NAME: ("A", 30)}, - "3p_Adapter": "GTACTCTGCGTTGATACCACTGCTT", - }, - "named_random_segments": ["CBC", "UMI", "cDNA"], - "coding_region": "cDNA", - "annotation_segments": { - "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG), - (longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)], - "CBC": [(longbow.utils.constants.READ_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG), - (longbow.utils.constants.READ_RAW_BARCODE_TAG, - longbow.utils.constants.READ_BARCODE_POS_TAG)], - }, - "deprecated": False, - "name": "sc_10x5p", - }, - - "bulk_10x5p": { - "description": "bulk 10x 5' kit", - "version": "3.0.0", - "structure": ["5p_Adapter", "UMI", "SLS", "cDNA", "Poly_A", "sample_index", "3p_Adapter"], - "adapters": { - "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", - "UMI": {FIXED_LENGTH_RANDOM_SEGMENT_TYPE_NAME: 10}, - "SLS": "TTTCTTATATGGG", # Switch Leader Seq - "cDNA": RANDOM_SEGMENT_NAME, - "Poly_A": {HPR_SEGMENT_TYPE_NAME: ("A", 30)}, - "sample_index": {FIXED_LENGTH_RANDOM_SEGMENT_TYPE_NAME: 10}, - "3p_Adapter": "CTCTGCGTTGATACCACTGCTT", - }, - "named_random_segments": ["UMI", "cDNA", "sample_index"], - "coding_region": "cDNA", - "annotation_segments": { - "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG), - (longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)], - "sample_index": [(longbow.utils.constants.READ_DEMUX_TAG, - longbow.utils.constants.READ_DEMUX_POS_TAG)], - }, - "deprecated": False, - "name": "bulk_10x5p", - }, - - "bulk_teloprimeV2": { - "description": "Lexogen TeloPrime V2 kit", - "version": "3.0.0", - "structure": ["TPV2_adapter", "cDNA", "Poly_A", "idx", "rev_bind"], - "adapters": { - "TPV2_adapter": "CTACACGACGCTCTTCCGATCTTGGATTGATATGTAATACGACTCACTATAG", - "cDNA": RANDOM_SEGMENT_NAME, - "Poly_A": {HPR_SEGMENT_TYPE_NAME: ("A", 30)}, - "idx": {FIXED_LENGTH_RANDOM_SEGMENT_TYPE_NAME: 10}, - "rev_bind": "CTCTGCGTTGATACCACTGCTT", - }, - "named_random_segments": ["idx", "cDNA"], - "coding_region": "cDNA", - "annotation_segments": { - "idx": [(longbow.utils.constants.READ_INDEX_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG)], - }, - "deprecated": False, - "name": "bulk_teloprimeV2", - }, - - # The slide-seq model is: - # - # |-----5p_Adapter----> |--splitter------> |------Poly_T----------------> |--------5p_Adapter----------| # noqa - # AGCTTACTTGTGAAGACTACACGACGCTCTTCCGATCTNNNNNNNNTCTTCAGCGTTCCCGAGANNNNNNNNNNNNNVVTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTVNNNNNNNNNNNNNNNNNCCCATGTACTCTGCGTTGATACCACTGCTTACTTGTAAGCTGTCTA... # noqa - # |------A-------> <------| <-----------| <----cDNA-------| |-------B------> # noqa - # V V - # Spatial Barcode 2 Spatial Barcode 1 - "spatial_slideseq": { - "description": "Slide-seq protocol", - "version": "3.0.0", - "structure": ["5p_Adapter", "SBC2", "SLS2", "SBC1", "UMI", "Poly_T", "cDNA", "3p_Adapter"], - "adapters": { - "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", - "SBC2": {FIXED_LENGTH_RANDOM_SEGMENT_TYPE_NAME: 8}, - "SLS2": "TCTTCAGCGTTCCCGAGA", # Switch Leader Seq - "SBC1": {FIXED_LENGTH_RANDOM_SEGMENT_TYPE_NAME: 6}, - # The UMI might be 7, rather than 9 elements long - not clear from the geneious file. - "UMI": {FIXED_LENGTH_RANDOM_SEGMENT_TYPE_NAME: 9}, - "Poly_T": {HPR_SEGMENT_TYPE_NAME: ("T", 30)}, - "cDNA": RANDOM_SEGMENT_NAME, - "3p_Adapter": "CCCATGTACTCTGCGTTGATACCACTGCTT", - }, - "named_random_segments": ["UMI", "SBC2", "SBC1", "cDNA"], - "coding_region": "cDNA", - "annotation_segments": { - "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG), - (longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)], - "SBC1": [(longbow.utils.constants.READ_SPATIAL_BARCODE1_TAG, - longbow.utils.constants.READ_SPATIAL_BARCODE1_POS_TAG)], - "SBC2": [(longbow.utils.constants.READ_SPATIAL_BARCODE2_TAG, - longbow.utils.constants.READ_SPATIAL_BARCODE2_POS_TAG)], - }, - "deprecated": False, - "name": "spatial_slideseq", - }, - } - } From dcb91e7fd52dc205de235d8d189615823b4191d4 Mon Sep 17 00:00:00 2001 From: James Webber Date: Thu, 6 Apr 2023 17:50:16 -0400 Subject: [PATCH 2/2] renamed json dir to 'models' --- MANIFEST.in | 2 +- .../__init__.py | 0 src/longbow/models/bulk_10x5p.json | 59 ++++++++++++++++ src/longbow/models/bulk_teloprimeV2.json | 42 +++++++++++ src/longbow/models/isoseq.json | 16 +++++ src/longbow/models/mas_10.json | 34 +++++++++ src/longbow/models/mas_15.json | 44 ++++++++++++ src/longbow/models/mas_16.json | 46 ++++++++++++ src/longbow/models/sc_10x3p.json | 61 ++++++++++++++++ src/longbow/models/sc_10x5p.json | 63 +++++++++++++++++ src/longbow/models/spatial_slideseq.json | 70 +++++++++++++++++++ .../preconfigured_models/array/isoseq.json | 14 ---- .../preconfigured_models/array/mas_10.json | 32 --------- .../preconfigured_models/array/mas_15.json | 42 ----------- .../preconfigured_models/array/mas_16.json | 44 ------------ .../preconfigured_models/cdna/bulk_10x5p.json | 57 --------------- .../cdna/bulk_teloprimeV2.json | 40 ----------- .../preconfigured_models/cdna/sc_10x3p.json | 59 ---------------- .../preconfigured_models/cdna/sc_10x5p.json | 61 ---------------- .../cdna/spatial_slideseq.json | 68 ------------------ src/longbow/utils/model_utils.py | 22 +++--- 21 files changed, 446 insertions(+), 430 deletions(-) rename src/longbow/{preconfigured_models => models}/__init__.py (100%) create mode 100644 src/longbow/models/bulk_10x5p.json create mode 100644 src/longbow/models/bulk_teloprimeV2.json create mode 100644 src/longbow/models/isoseq.json create mode 100644 src/longbow/models/mas_10.json create mode 100644 src/longbow/models/mas_15.json create mode 100644 src/longbow/models/mas_16.json create mode 100644 src/longbow/models/sc_10x3p.json create mode 100644 src/longbow/models/sc_10x5p.json create mode 100644 src/longbow/models/spatial_slideseq.json delete mode 100644 src/longbow/preconfigured_models/array/isoseq.json delete mode 100644 src/longbow/preconfigured_models/array/mas_10.json delete mode 100644 src/longbow/preconfigured_models/array/mas_15.json delete mode 100644 src/longbow/preconfigured_models/array/mas_16.json delete mode 100644 src/longbow/preconfigured_models/cdna/bulk_10x5p.json delete mode 100644 src/longbow/preconfigured_models/cdna/bulk_teloprimeV2.json delete mode 100644 src/longbow/preconfigured_models/cdna/sc_10x3p.json delete mode 100644 src/longbow/preconfigured_models/cdna/sc_10x5p.json delete mode 100644 src/longbow/preconfigured_models/cdna/spatial_slideseq.json diff --git a/MANIFEST.in b/MANIFEST.in index 2141ac20..6d155adc 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,7 +4,7 @@ graft src include README.rst include LICENSE.txt include tox.ini .travis.yml -include src/longbow/preconfigured_models/**/*.json +include src/longbow/models/*.json prune **/.hypothesis diff --git a/src/longbow/preconfigured_models/__init__.py b/src/longbow/models/__init__.py similarity index 100% rename from src/longbow/preconfigured_models/__init__.py rename to src/longbow/models/__init__.py diff --git a/src/longbow/models/bulk_10x5p.json b/src/longbow/models/bulk_10x5p.json new file mode 100644 index 00000000..84931461 --- /dev/null +++ b/src/longbow/models/bulk_10x5p.json @@ -0,0 +1,59 @@ +{ + "cdna": { + "description": "bulk 10x 5' kit", + "version": "3.0.0", + "structure": [ + "5p_Adapter", + "UMI", + "SLS", + "cDNA", + "Poly_A", + "sample_index", + "3p_Adapter" + ], + "adapters": { + "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", + "UMI": { + "FixedLengthRandomBases": 10 + }, + "SLS": "TTTCTTATATGGG", + "cDNA": "random", + "Poly_A": { + "HomopolymerRepeat": [ + "A", + 30 + ] + }, + "sample_index": { + "FixedLengthRandomBases": 10 + }, + "3p_Adapter": "CTCTGCGTTGATACCACTGCTT" + }, + "named_random_segments": [ + "UMI", + "cDNA", + "sample_index" + ], + "coding_region": "cDNA", + "annotation_segments": { + "UMI": [ + [ + "ZU", + "XU" + ], + [ + "XM", + "XU" + ] + ], + "sample_index": [ + [ + "id", + "ip" + ] + ] + }, + "deprecated": false, + "name": "bulk_10x5p" + } +} \ No newline at end of file diff --git a/src/longbow/models/bulk_teloprimeV2.json b/src/longbow/models/bulk_teloprimeV2.json new file mode 100644 index 00000000..9b0004bd --- /dev/null +++ b/src/longbow/models/bulk_teloprimeV2.json @@ -0,0 +1,42 @@ +{ + "cdna": { + "description": "Lexogen TeloPrime V2 kit", + "version": "3.0.0", + "structure": [ + "TPV2_adapter", + "cDNA", + "Poly_A", + "idx", + "rev_bind" + ], + "adapters": { + "TPV2_adapter": "CTACACGACGCTCTTCCGATCTTGGATTGATATGTAATACGACTCACTATAG", + "cDNA": "random", + "Poly_A": { + "HomopolymerRepeat": [ + "A", + 30 + ] + }, + "idx": { + "FixedLengthRandomBases": 10 + }, + "rev_bind": "CTCTGCGTTGATACCACTGCTT" + }, + "named_random_segments": [ + "idx", + "cDNA" + ], + "coding_region": "cDNA", + "annotation_segments": { + "idx": [ + [ + "BC", + "XB" + ] + ] + }, + "deprecated": false, + "name": "bulk_teloprimeV2" + } +} \ No newline at end of file diff --git a/src/longbow/models/isoseq.json b/src/longbow/models/isoseq.json new file mode 100644 index 00000000..2b050fdd --- /dev/null +++ b/src/longbow/models/isoseq.json @@ -0,0 +1,16 @@ +{ + "array": { + "description": "PacBio IsoSeq model", + "version": "3.0.0", + "structure": [ + "V", + "M" + ], + "adapters": { + "V": "TCTACACGACGCTCTTCCGATCT", + "M": "GTACTCTGCGTTGATACCACTGCTT" + }, + "deprecated": false, + "name": "isoseq" + } +} \ No newline at end of file diff --git a/src/longbow/models/mas_10.json b/src/longbow/models/mas_10.json new file mode 100644 index 00000000..1e010488 --- /dev/null +++ b/src/longbow/models/mas_10.json @@ -0,0 +1,34 @@ +{ + "array": { + "description": "10-element MAS-ISO-seq array", + "version": "3.0.0", + "structure": [ + "Q", + "C", + "M", + "I", + "O", + "J", + "B", + "D", + "K", + "H", + "R" + ], + "adapters": { + "Q": "AAGCACCATAATGTGT", + "C": "ACTCTGTCAGGTCCGA", + "M": "ACCTAGATCAGAGCCT", + "I": "AGTGCGTTGCGAATTG", + "O": "AAGTCACCGGCACCTT", + "J": "AATTGCGTAGTTGGCC", + "B": "ACTTGTAAGCTGTCTA", + "D": "ACCTCCTCCTCCAGAA", + "K": "ACACTTGGTCGCAATC", + "H": "ATGTTGAATCCTAGCG", + "R": "AACCGGACACACTTAG" + }, + "deprecated": false, + "name": "mas_10" + } +} \ No newline at end of file diff --git a/src/longbow/models/mas_15.json b/src/longbow/models/mas_15.json new file mode 100644 index 00000000..0536574f --- /dev/null +++ b/src/longbow/models/mas_15.json @@ -0,0 +1,44 @@ +{ + "array": { + "description": "15-element MAS-ISO-seq array", + "version": "3.0.0", + "structure": [ + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "J", + "K", + "L", + "M", + "N", + "O", + "P" + ], + "adapters": { + "A": "AGCTTACTTGTGAAGA", + "B": "ACTTGTAAGCTGTCTA", + "C": "ACTCTGTCAGGTCCGA", + "D": "ACCTCCTCCTCCAGAA", + "E": "AACCGGACACACTTAG", + "F": "AGAGTCCAATTCGCAG", + "G": "AATCAAGGCTTAACGG", + "H": "ATGTTGAATCCTAGCG", + "I": "AGTGCGTTGCGAATTG", + "J": "AATTGCGTAGTTGGCC", + "K": "ACACTTGGTCGCAATC", + "L": "AGTAAGCCTTCGTGTC", + "M": "ACCTAGATCAGAGCCT", + "N": "AGGTATGCCGGTTAAG", + "O": "AAGTCACCGGCACCTT", + "P": "ATGAAGTGGCTCGAGA" + }, + "deprecated": false, + "name": "mas_15" + } +} \ No newline at end of file diff --git a/src/longbow/models/mas_16.json b/src/longbow/models/mas_16.json new file mode 100644 index 00000000..c2623011 --- /dev/null +++ b/src/longbow/models/mas_16.json @@ -0,0 +1,46 @@ +{ + "array": { + "description": "16-element MAS-ISO-seq array", + "version": "3.0.0", + "structure": [ + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "J", + "K", + "L", + "M", + "N", + "O", + "P", + "Q" + ], + "adapters": { + "A": "AGCTTACTTGTGAAGA", + "B": "ACTTGTAAGCTGTCTA", + "C": "ACTCTGTCAGGTCCGA", + "D": "ACCTCCTCCTCCAGAA", + "E": "AACCGGACACACTTAG", + "F": "AGAGTCCAATTCGCAG", + "G": "AATCAAGGCTTAACGG", + "H": "ATGTTGAATCCTAGCG", + "I": "AGTGCGTTGCGAATTG", + "J": "AATTGCGTAGTTGGCC", + "K": "ACACTTGGTCGCAATC", + "L": "AGTAAGCCTTCGTGTC", + "M": "ACCTAGATCAGAGCCT", + "N": "AGGTATGCCGGTTAAG", + "O": "AAGTCACCGGCACCTT", + "P": "ATGAAGTGGCTCGAGA", + "Q": "AGTAGCTGTGTGCA" + }, + "deprecated": false, + "name": "mas_16" + } +} \ No newline at end of file diff --git a/src/longbow/models/sc_10x3p.json b/src/longbow/models/sc_10x3p.json new file mode 100644 index 00000000..afa4c2bd --- /dev/null +++ b/src/longbow/models/sc_10x3p.json @@ -0,0 +1,61 @@ +{ + "cdna": { + "description": "single-cell 10x 3' kit", + "version": "3.0.0", + "structure": [ + "5p_Adapter", + "CBC", + "UMI", + "Poly_T", + "cDNA", + "3p_Adapter" + ], + "adapters": { + "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", + "CBC": { + "FixedLengthRandomBases": 16 + }, + "UMI": { + "FixedLengthRandomBases": 12 + }, + "Poly_T": { + "HomopolymerRepeat": [ + "T", + 30 + ] + }, + "cDNA": "random", + "3p_Adapter": "CCCATGTACTCTGCGTTGATACCACTGCTT" + }, + "named_random_segments": [ + "CBC", + "UMI", + "cDNA" + ], + "coding_region": "cDNA", + "annotation_segments": { + "UMI": [ + [ + "ZU", + "XU" + ], + [ + "XM", + "XU" + ] + ], + "CBC": [ + [ + "CR", + "XB" + ], + [ + "XC", + "XB" + ] + ] + }, + "deprecated": false, + "name": "sc_10x3p" + } +} \ No newline at end of file diff --git a/src/longbow/models/sc_10x5p.json b/src/longbow/models/sc_10x5p.json new file mode 100644 index 00000000..35bf797d --- /dev/null +++ b/src/longbow/models/sc_10x5p.json @@ -0,0 +1,63 @@ +{ + "cdna": { + "description": "single-cell 10x 5' kit", + "version": "3.0.0", + "structure": [ + "5p_Adapter", + "CBC", + "UMI", + "SLS", + "cDNA", + "Poly_A", + "3p_Adapter" + ], + "adapters": { + "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", + "CBC": { + "FixedLengthRandomBases": 16 + }, + "UMI": { + "FixedLengthRandomBases": 10 + }, + "SLS": "TTTCTTATATGGG", + "cDNA": "random", + "Poly_A": { + "HomopolymerRepeat": [ + "A", + 30 + ] + }, + "3p_Adapter": "GTACTCTGCGTTGATACCACTGCTT" + }, + "named_random_segments": [ + "CBC", + "UMI", + "cDNA" + ], + "coding_region": "cDNA", + "annotation_segments": { + "UMI": [ + [ + "ZU", + "XU" + ], + [ + "XM", + "XU" + ] + ], + "CBC": [ + [ + "CR", + "XB" + ], + [ + "XC", + "XB" + ] + ] + }, + "deprecated": false, + "name": "sc_10x5p" + } +} \ No newline at end of file diff --git a/src/longbow/models/spatial_slideseq.json b/src/longbow/models/spatial_slideseq.json new file mode 100644 index 00000000..5597265e --- /dev/null +++ b/src/longbow/models/spatial_slideseq.json @@ -0,0 +1,70 @@ +{ + "cdna": { + "description": "Slide-seq protocol", + "version": "3.0.0", + "structure": [ + "5p_Adapter", + "SBC2", + "SLS2", + "SBC1", + "UMI", + "Poly_T", + "cDNA", + "3p_Adapter" + ], + "adapters": { + "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", + "SBC2": { + "FixedLengthRandomBases": 8 + }, + "SLS2": "TCTTCAGCGTTCCCGAGA", + "SBC1": { + "FixedLengthRandomBases": 6 + }, + "UMI": { + "FixedLengthRandomBases": 9 + }, + "Poly_T": { + "HomopolymerRepeat": [ + "T", + 30 + ] + }, + "cDNA": "random", + "3p_Adapter": "CCCATGTACTCTGCGTTGATACCACTGCTT" + }, + "named_random_segments": [ + "UMI", + "SBC2", + "SBC1", + "cDNA" + ], + "coding_region": "cDNA", + "annotation_segments": { + "UMI": [ + [ + "ZU", + "XU" + ], + [ + "XM", + "XU" + ] + ], + "SBC1": [ + [ + "X1", + "XP" + ] + ], + "SBC2": [ + [ + "X2", + "XR" + ] + ] + }, + "deprecated": false, + "name": "spatial_slideseq" + } +} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/array/isoseq.json b/src/longbow/preconfigured_models/array/isoseq.json deleted file mode 100644 index a6321b32..00000000 --- a/src/longbow/preconfigured_models/array/isoseq.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "description": "PacBio IsoSeq model", - "version": "3.0.0", - "structure": [ - "V", - "M" - ], - "adapters": { - "V": "TCTACACGACGCTCTTCCGATCT", - "M": "GTACTCTGCGTTGATACCACTGCTT" - }, - "deprecated": false, - "name": "isoseq" -} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/array/mas_10.json b/src/longbow/preconfigured_models/array/mas_10.json deleted file mode 100644 index 456079fb..00000000 --- a/src/longbow/preconfigured_models/array/mas_10.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "description": "10-element MAS-ISO-seq array", - "version": "3.0.0", - "structure": [ - "Q", - "C", - "M", - "I", - "O", - "J", - "B", - "D", - "K", - "H", - "R" - ], - "adapters": { - "Q": "AAGCACCATAATGTGT", - "C": "ACTCTGTCAGGTCCGA", - "M": "ACCTAGATCAGAGCCT", - "I": "AGTGCGTTGCGAATTG", - "O": "AAGTCACCGGCACCTT", - "J": "AATTGCGTAGTTGGCC", - "B": "ACTTGTAAGCTGTCTA", - "D": "ACCTCCTCCTCCAGAA", - "K": "ACACTTGGTCGCAATC", - "H": "ATGTTGAATCCTAGCG", - "R": "AACCGGACACACTTAG" - }, - "deprecated": false, - "name": "mas_10" -} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/array/mas_15.json b/src/longbow/preconfigured_models/array/mas_15.json deleted file mode 100644 index b1aae4f8..00000000 --- a/src/longbow/preconfigured_models/array/mas_15.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "description": "15-element MAS-ISO-seq array", - "version": "3.0.0", - "structure": [ - "A", - "B", - "C", - "D", - "E", - "F", - "G", - "H", - "I", - "J", - "K", - "L", - "M", - "N", - "O", - "P" - ], - "adapters": { - "A": "AGCTTACTTGTGAAGA", - "B": "ACTTGTAAGCTGTCTA", - "C": "ACTCTGTCAGGTCCGA", - "D": "ACCTCCTCCTCCAGAA", - "E": "AACCGGACACACTTAG", - "F": "AGAGTCCAATTCGCAG", - "G": "AATCAAGGCTTAACGG", - "H": "ATGTTGAATCCTAGCG", - "I": "AGTGCGTTGCGAATTG", - "J": "AATTGCGTAGTTGGCC", - "K": "ACACTTGGTCGCAATC", - "L": "AGTAAGCCTTCGTGTC", - "M": "ACCTAGATCAGAGCCT", - "N": "AGGTATGCCGGTTAAG", - "O": "AAGTCACCGGCACCTT", - "P": "ATGAAGTGGCTCGAGA" - }, - "deprecated": false, - "name": "mas_15" -} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/array/mas_16.json b/src/longbow/preconfigured_models/array/mas_16.json deleted file mode 100644 index 0b9ef39c..00000000 --- a/src/longbow/preconfigured_models/array/mas_16.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "description": "16-element MAS-ISO-seq array", - "version": "3.0.0", - "structure": [ - "A", - "B", - "C", - "D", - "E", - "F", - "G", - "H", - "I", - "J", - "K", - "L", - "M", - "N", - "O", - "P", - "Q" - ], - "adapters": { - "A": "AGCTTACTTGTGAAGA", - "B": "ACTTGTAAGCTGTCTA", - "C": "ACTCTGTCAGGTCCGA", - "D": "ACCTCCTCCTCCAGAA", - "E": "AACCGGACACACTTAG", - "F": "AGAGTCCAATTCGCAG", - "G": "AATCAAGGCTTAACGG", - "H": "ATGTTGAATCCTAGCG", - "I": "AGTGCGTTGCGAATTG", - "J": "AATTGCGTAGTTGGCC", - "K": "ACACTTGGTCGCAATC", - "L": "AGTAAGCCTTCGTGTC", - "M": "ACCTAGATCAGAGCCT", - "N": "AGGTATGCCGGTTAAG", - "O": "AAGTCACCGGCACCTT", - "P": "ATGAAGTGGCTCGAGA", - "Q": "AGTAGCTGTGTGCA" - }, - "deprecated": false, - "name": "mas_16" -} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/cdna/bulk_10x5p.json b/src/longbow/preconfigured_models/cdna/bulk_10x5p.json deleted file mode 100644 index 16b50d05..00000000 --- a/src/longbow/preconfigured_models/cdna/bulk_10x5p.json +++ /dev/null @@ -1,57 +0,0 @@ -{ - "description": "bulk 10x 5' kit", - "version": "3.0.0", - "structure": [ - "5p_Adapter", - "UMI", - "SLS", - "cDNA", - "Poly_A", - "sample_index", - "3p_Adapter" - ], - "adapters": { - "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", - "UMI": { - "FixedLengthRandomBases": 10 - }, - "SLS": "TTTCTTATATGGG", - "cDNA": "random", - "Poly_A": { - "HomopolymerRepeat": [ - "A", - 30 - ] - }, - "sample_index": { - "FixedLengthRandomBases": 10 - }, - "3p_Adapter": "CTCTGCGTTGATACCACTGCTT" - }, - "named_random_segments": [ - "UMI", - "cDNA", - "sample_index" - ], - "coding_region": "cDNA", - "annotation_segments": { - "UMI": [ - [ - "ZU", - "XU" - ], - [ - "XM", - "XU" - ] - ], - "sample_index": [ - [ - "id", - "ip" - ] - ] - }, - "deprecated": false, - "name": "bulk_10x5p" -} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/cdna/bulk_teloprimeV2.json b/src/longbow/preconfigured_models/cdna/bulk_teloprimeV2.json deleted file mode 100644 index f3d02b47..00000000 --- a/src/longbow/preconfigured_models/cdna/bulk_teloprimeV2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "description": "Lexogen TeloPrime V2 kit", - "version": "3.0.0", - "structure": [ - "TPV2_adapter", - "cDNA", - "Poly_A", - "idx", - "rev_bind" - ], - "adapters": { - "TPV2_adapter": "CTACACGACGCTCTTCCGATCTTGGATTGATATGTAATACGACTCACTATAG", - "cDNA": "random", - "Poly_A": { - "HomopolymerRepeat": [ - "A", - 30 - ] - }, - "idx": { - "FixedLengthRandomBases": 10 - }, - "rev_bind": "CTCTGCGTTGATACCACTGCTT" - }, - "named_random_segments": [ - "idx", - "cDNA" - ], - "coding_region": "cDNA", - "annotation_segments": { - "idx": [ - [ - "BC", - "XB" - ] - ] - }, - "deprecated": false, - "name": "bulk_teloprimeV2" -} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/cdna/sc_10x3p.json b/src/longbow/preconfigured_models/cdna/sc_10x3p.json deleted file mode 100644 index 04f5105a..00000000 --- a/src/longbow/preconfigured_models/cdna/sc_10x3p.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "description": "single-cell 10x 3' kit", - "version": "3.0.0", - "structure": [ - "5p_Adapter", - "CBC", - "UMI", - "Poly_T", - "cDNA", - "3p_Adapter" - ], - "adapters": { - "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", - "CBC": { - "FixedLengthRandomBases": 16 - }, - "UMI": { - "FixedLengthRandomBases": 12 - }, - "Poly_T": { - "HomopolymerRepeat": [ - "T", - 30 - ] - }, - "cDNA": "random", - "3p_Adapter": "CCCATGTACTCTGCGTTGATACCACTGCTT" - }, - "named_random_segments": [ - "CBC", - "UMI", - "cDNA" - ], - "coding_region": "cDNA", - "annotation_segments": { - "UMI": [ - [ - "ZU", - "XU" - ], - [ - "XM", - "XU" - ] - ], - "CBC": [ - [ - "CR", - "XB" - ], - [ - "XC", - "XB" - ] - ] - }, - "deprecated": false, - "name": "sc_10x3p" -} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/cdna/sc_10x5p.json b/src/longbow/preconfigured_models/cdna/sc_10x5p.json deleted file mode 100644 index f21576a2..00000000 --- a/src/longbow/preconfigured_models/cdna/sc_10x5p.json +++ /dev/null @@ -1,61 +0,0 @@ -{ - "description": "single-cell 10x 5' kit", - "version": "3.0.0", - "structure": [ - "5p_Adapter", - "CBC", - "UMI", - "SLS", - "cDNA", - "Poly_A", - "3p_Adapter" - ], - "adapters": { - "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", - "CBC": { - "FixedLengthRandomBases": 16 - }, - "UMI": { - "FixedLengthRandomBases": 10 - }, - "SLS": "TTTCTTATATGGG", - "cDNA": "random", - "Poly_A": { - "HomopolymerRepeat": [ - "A", - 30 - ] - }, - "3p_Adapter": "GTACTCTGCGTTGATACCACTGCTT" - }, - "named_random_segments": [ - "CBC", - "UMI", - "cDNA" - ], - "coding_region": "cDNA", - "annotation_segments": { - "UMI": [ - [ - "ZU", - "XU" - ], - [ - "XM", - "XU" - ] - ], - "CBC": [ - [ - "CR", - "XB" - ], - [ - "XC", - "XB" - ] - ] - }, - "deprecated": false, - "name": "sc_10x5p" -} \ No newline at end of file diff --git a/src/longbow/preconfigured_models/cdna/spatial_slideseq.json b/src/longbow/preconfigured_models/cdna/spatial_slideseq.json deleted file mode 100644 index 3ace8e65..00000000 --- a/src/longbow/preconfigured_models/cdna/spatial_slideseq.json +++ /dev/null @@ -1,68 +0,0 @@ -{ - "description": "Slide-seq protocol", - "version": "3.0.0", - "structure": [ - "5p_Adapter", - "SBC2", - "SLS2", - "SBC1", - "UMI", - "Poly_T", - "cDNA", - "3p_Adapter" - ], - "adapters": { - "5p_Adapter": "TCTACACGACGCTCTTCCGATCT", - "SBC2": { - "FixedLengthRandomBases": 8 - }, - "SLS2": "TCTTCAGCGTTCCCGAGA", - "SBC1": { - "FixedLengthRandomBases": 6 - }, - "UMI": { - "FixedLengthRandomBases": 9 - }, - "Poly_T": { - "HomopolymerRepeat": [ - "T", - 30 - ] - }, - "cDNA": "random", - "3p_Adapter": "CCCATGTACTCTGCGTTGATACCACTGCTT" - }, - "named_random_segments": [ - "UMI", - "SBC2", - "SBC1", - "cDNA" - ], - "coding_region": "cDNA", - "annotation_segments": { - "UMI": [ - [ - "ZU", - "XU" - ], - [ - "XM", - "XU" - ] - ], - "SBC1": [ - [ - "X1", - "XP" - ] - ], - "SBC2": [ - [ - "X2", - "XR" - ] - ] - }, - "deprecated": false, - "name": "spatial_slideseq" -} \ No newline at end of file diff --git a/src/longbow/utils/model_utils.py b/src/longbow/utils/model_utils.py index c67b68f4..82480021 100644 --- a/src/longbow/utils/model_utils.py +++ b/src/longbow/utils/model_utils.py @@ -23,21 +23,19 @@ starts_with_number_re = re.compile(r"^\d") -def load_preconfigured_models(): - pre_configured_models = {"array": {}, "cdna": {}} +def load_models(): + models = {"array": {}, "cdna": {}} - with importlib.resources.path("longbow", "preconfigured_models") as model_dir: - for json_file in (model_dir / "array").glob("*json"): + with importlib.resources.path("longbow", "models") as model_dir: + for json_file in model_dir.glob("*json"): with json_file.open() as fh: m = json.load(fh) - pre_configured_models["array"][m["name"]] = m + if "array" in m: + models["array"][m["array"]["name"]] = m["array"] + if "cdna" in m: + models["cdna"][m["cdna"]["name"]] = m["cdna"] - for json_file in (model_dir / "cdna").glob("*json"): - with json_file.open() as fh: - m = json.load(fh) - pre_configured_models["cdna"][m["name"]] = m - - return pre_configured_models + return models class ModelBuilder: @@ -79,7 +77,7 @@ class ModelBuilder: SUDDEN_END_PROB = 0.01 MATCH_END_PROB = 0.1 - pre_configured_models = load_preconfigured_models() + pre_configured_models = load_models() @staticmethod def make_global_alignment_model(target, name=None):