# config.ini

[preprocess]

# in_tensor_dir has one alternative per machine below. Exactly one line should
# be left uncommented; the other alternatives stay commented out.

# For raptor
#in_tensor_dir : ./data/train-ready/pred-full/

# For comet
#in_tensor_dir : /scratch/mtari008/37154933/pred-full-deepnovo/

# For expanse
# NOTE(review): the active path below lives under /disk/raptor-2/ although it
# is labelled "For expanse" -- confirm which machine this entry is meant for.
in_tensor_dir : /disk/raptor-2/mtari008/data/deepsnap/train-ready/nist_massiv_80k_no_ch_graymass-semi/

############ INPUT PARAMETERS ############
[input]

# File paths. The three path parameters below can be ignored
# (msp_files is currently commented out).
; msp_files : /oasis/projects/nsf/wmu101/mtari008/DeepSNAP/data/msp
mgf_files : sample_data/mgfs
db_peps_path : /expanse/lustre/projects/wmu101/mtari008/DeepSNAP/data/db_peps/db-peps.pkl

spec_size : 80000 # The array size to store a spectrum.

charge : 8 # Max charge value to be used for training.

use_mods : False # Whether to use modifications or not (both training and database search).

# A separate num_mods key also exists in the [search] section.
num_mods: 5 # Max mods per peptide

num_species : 9 # Number of species the training dataset contains. Deprecated; will not have any effect.

master_port : 12345 # If you get an error that the port is already in use, change this value to another number.

############ DATABASE SEARCH PARAMETERS ############
[search]

# This is the model that will be loaded during search.
# NOTE(review): earlier notes here said the model is loaded from the models
# directory and that a trailing "22" in the name is the epoch number (that is
# how the models are saved). The current value has no epoch suffix and points
# into model_weights/ -- confirm which convention still applies.
model_name : model_weights/specollate_model.pt

# Directory path with the mgf files to be searched. Files must have the .mgf extension.
# NOTE(review): the original note asks for an absolute path, but the sample
# value below is relative -- confirm which is required.
mgf_dir : sample_data/mgfs

# Path where the preprocessed mgf spectra from the above directory will be placed.
prep_dir : sample_data/preprocessed

# Directory path containing the peptide file obtained from the OpenMS Digestor tool.
pep_dir : sample_data/peptides

# Directory path where the percolator input files will be placed.
# Use the crux percolator tool to analyze these files.
out_pin_dir : output

# Batch sizes for the forward pass through the network.
# These sizes have been tested for 12 GBs of GPU memory.
spec_batch_size : 16384
pep_batch_size : 4096

# Batch size for database search. 1024 seems to work better.
search_spec_batch_size : 1024

precursor_tolerance : 20 # Precursor tolerance to use during database search (Da or ppm)
precursor_tolerance_type : ppm # either ppm or Da

keep_psms : 5 # Number of top scoring psms to keep

# Number of modified peptides to be generated to search against.
# Different from the num_mods key in the [input] section.
num_mods : 1

# Charge filter for input spectra.
# Note that spectra with all charges will be searched against charge-independent peptide embeddings.
charge: 8

############ MACHINE LEARNING PARAMETERS ############
[ml]

# The model will be stored under this name in the /models directory.
model_name : 512-embed-2-lstm-SnapLoss2D-80k-nist-massive-gmc-semi-r2

# NOTE(review): the keys below carry no documentation in this file; they are
# presumably training hyperparameters -- confirm meaning and units against the
# training code before changing them.

batch_size : 1024

test_size : 0.2

pep_seq_len : 64

train_count : 0

snp_weight : 1

ce_weight : 0.001

mse_weight : 0.00001

dropout : 0.3

lr : 0.0001

weight_decay : 0.0001

epochs : 200

margin : 0.2

read_split_listing : False

############ DEFAULT VALUES ############
# DO NOT CHANGE
# Several values here differ from the [input] section: spec_size is 8000 here
# versus 80000 there, charge is 2 versus 8, and the key is msp_file (singular)
# here versus msp_files there.
# NOTE(review): the section name is lowercase. Python's configparser only
# treats the exact name DEFAULT as its fallback section, so under that parser
# these keys would not be inherited by other sections -- confirm how the
# application reads this section.
[default]
msp_file : /data/human_consensus_final_true_lib.msp
mgf_files : /data/
spec_size : 8000
charge : 2
use_mods : False
batch_size : 1024