Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed support for newer Gencode GTF versions #8351

Merged
merged 5 commits into from
Jun 28, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
responded to comments and removed the AnonomyousOptionalField string …
…from GencodeGTFFeature (which had a bug with ordering before)
jamesemery committed Jun 27, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit 125aa5ad92a73c7db170237030a027a70f3f9590
Original file line number Diff line number Diff line change
@@ -12,6 +12,48 @@
* be instantiated if necessary to preserve the spec compliance of funcotator against future releases of Gencode.
*/
public class GencodeGTFFieldConstants {

/**
* A list of required fields from the GencodeGTF codec. These are the fields that are currently first-class members of the
* GencodeGTFFeature and are handled specially handled by the GencodeGTFCodec.
*/
public static final String GENE_ID = "gene_id";
public static final String TRANSCRIPT_ID = "transcript_id";
public static final String GENE_TYPE = "gene_type";
public static final String GENE_BIOTYPE = "gene_biotype";
public static final String GENE_STATUS = "gene_status";
public static final String GENE_NAME = "gene_name";
public static final String TRANSCRIPT_TYPE = "transcript_type";
public static final String TRANSCRIPT_BIOTYPE = "transcript_biotype";
public static final String TRANSCRIPT_STATUS = "transcript_status";
public static final String TRANSCRIPT_NAME = "transcript_name";
public static final String LEVEL = "level";

// Mandatory except in Gene and Transcript lines (not enforced by this codec)
public static final String EXON_NUMBER = "exon_number";
public static final String EXON_ID = "exon_id";

public class GencodeOptionalFields {
/**
* Spec optional gencode fields:
*/
public static final String TAG = "tag";
public static final String CCDSID = "ccdsid";
public static final String HAVANA_GENE = "havana_gene";
public static final String HAVANA_TRANSCRIPT = "havana_transcript";
public static final String PROTEIN_ID = "protein_id";
public static final String ONT = "ont";
public static final String TRANSCRIPT_SUPPORT_LEVEL = "transcript_support_level";
public static final String REMAP_STATUS = "remap_status";
public static final String REMAP_ORIGINAL_ID = "remap_original_id";
public static final String REMAP_ORIGINAL_LOCATION = "remap_original_location";
public static final String REMAP_NUM_MAPPINGS = "remap_num_mappings";
public static final String REMAP_TARGET_STATUS = "remap_target_status";
public static final String REMAP_SUBSTITUTED_MISSING_TARGET = "remap_substituted_missing_target";
public static final String HGNC_ID = "hgnc_id";
public static final String MGI_ID = "mgi_id";
}

/**
* Biotype / transcript type for the transcript or gene represented in a feature.
* This is a tag of some biological function associated with a feature.
Original file line number Diff line number Diff line change
@@ -69,7 +69,7 @@ final public class GencodeGtfCodec extends AbstractGtfCodec {
/**
* Maximum version of gencode that will not generate a warning. This parser will still attempt to parse versions above this number, but a warning about potential errors will appear.
*/
private static final int GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE = 43;
private static final int GENCODE_GTF_MAX_TESTED_VERSION = 43;

private int currentLineNum = 1;
private final List<String> header = new ArrayList<>();
@@ -315,8 +315,8 @@ boolean validateHeader(final List<String> header, final boolean throwIfInvalid)
}
}

if (versionNumber > GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE) {
logger.warn("GENCODE GTF Header line 1 has a version number that is above maximum tested version (v " + GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE + ") (given: " +
if (versionNumber > GENCODE_GTF_MAX_TESTED_VERSION) {
logger.warn("GENCODE GTF Header line 1 has a version number that is above maximum tested version (v " + GENCODE_GTF_MAX_TESTED_VERSION + ") (given: " +
versionNumber + "): " + header.get(0) + " Continuing, but errors may occur.");
}
}
Original file line number Diff line number Diff line change
@@ -148,100 +148,53 @@ protected GencodeGtfFeature(final String[] gtfFields, final String gtfFileType)

switch (fieldName) {
// Find the right field to set:
case "gene_id":
case GencodeGTFFieldConstants.GENE_ID:
baseData.geneId = fieldValue;
break;
case "transcript_id":
case GencodeGTFFieldConstants.TRANSCRIPT_ID:
baseData.transcriptId = fieldValue;
break;
case "gene_type":
case GencodeGTFFieldConstants.GENE_TYPE:
baseData.geneType = fieldValue;
break;
// For ENSEMBL GTF files:
case "gene_biotype":
case GencodeGTFFieldConstants.GENE_BIOTYPE:
baseData.geneType = fieldValue;
break;
case "gene_status":
case GencodeGTFFieldConstants.GENE_STATUS:
baseData.geneStatus = fieldValue;
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
break;
case "gene_name":
case GencodeGTFFieldConstants.GENE_NAME:
baseData.geneName = fieldValue;
break;
case "transcript_type":
case GencodeGTFFieldConstants.TRANSCRIPT_TYPE:
baseData.transcriptType = fieldValue;
break;
case "transcript_biotype":
case GencodeGTFFieldConstants.TRANSCRIPT_BIOTYPE:
baseData.transcriptType = fieldValue;
break;
case "transcript_status":
case GencodeGTFFieldConstants.TRANSCRIPT_STATUS:
baseData.transcriptStatus = fieldValue;
break;
case "transcript_name":
case GencodeGTFFieldConstants.TRANSCRIPT_NAME:
baseData.transcriptName = fieldValue;
break;
case "exon_number":
case GencodeGTFFieldConstants.EXON_NUMBER:
try {
baseData.exonNumber = Integer.valueOf(fieldValue);
}
catch (final NumberFormatException ex) {
throw new UserException.MalformedFile("Could not convert field value into integer: " + fieldValue);
}
break;
case "exon_id":
case GencodeGTFFieldConstants.EXON_ID:
baseData.exonId = fieldValue;
break;
case "level":
case GencodeGTFFieldConstants.LEVEL:
baseData.locusLevel = fieldValue;
break;
case "tag":
optionalField = new OptionalField<>(fieldName, fieldValue);
break;
case "ccdsid":
optionalField = new OptionalField<>(fieldName, fieldValue);
break;
case "havana_gene":
optionalField = new OptionalField<>(fieldName, fieldValue);
break;
case "havana_transcript":
optionalField = new OptionalField<>(fieldName, fieldValue);
break;
case "protein_id":
optionalField = new OptionalField<>(fieldName, fieldValue);
break;
case "ont":
optionalField = new OptionalField<>(fieldName, fieldValue);
break;
case "transcript_support_level":
optionalField = new OptionalField<>(fieldName, fieldValue);
break;
case "remap_status":
optionalField = new OptionalField<>(fieldName, fieldValue);
break;
case "remap_original_id":
optionalField = new OptionalField<>(fieldName, fieldValue);
break;
case "remap_original_location":
try {
optionalField = new OptionalField<>(fieldName, Long.valueOf(fieldValue));
}
catch (final NumberFormatException nfe) {
// We must have gotten a field that has a different format.
// For now, just copy it over:
optionalField = new OptionalField<>(fieldName, fieldValue);
}
break;
case "remap_num_mappings":
optionalField = new OptionalField<>(fieldName, Long.valueOf(fieldValue));
break;
case "remap_target_status":
optionalField = new OptionalField<>(fieldName, fieldValue);
break;
case "remap_substituted_missing_target":
optionalField = new OptionalField<>(fieldName, fieldValue);
break;
default:
anonymousOptionalFieldBuilder.append(extraField);
anonymousOptionalFieldBuilder.append(EXTRA_FIELD_DELIMITER);
optionalField = new OptionalField<>(fieldName, fieldValue);
break;
}

@@ -250,11 +203,6 @@ protected GencodeGtfFeature(final String[] gtfFields, final String gtfFileType)
baseData.optionalFields.add(optionalField);
}
}

// Save our anonymous optional fields:
if ( anonymousOptionalFieldBuilder.length() != 0 ) {
baseData.anonymousOptionalFields = anonymousOptionalFieldBuilder.toString();
}
}

/**
@@ -458,10 +406,6 @@ private String serializeToStringHelper() {
baseData.optionalFields.stream().map(Object::toString).collect(Collectors.joining(" "))
);

if ( baseData.anonymousOptionalFields != null ) {
stringBuilder.append(baseData.anonymousOptionalFields);
}

return stringBuilder.toString().trim();
}

@@ -581,11 +525,7 @@ public List<OptionalField<?>> getOptionalFields() {
return baseData.optionalFields;
}

public String getAnonymousOptionalFields() {
return baseData.anonymousOptionalFields;
}

// Certian optional fields support multiple values, so we need to return a list of them.
// Certian optional fields support multiple values (for example "tag"), so we need to return a list of them.
public List<OptionalField<?>> getOptionalField(final String key) {
final List<OptionalField<?>> optionalFields = new ArrayList<>();
for (final OptionalField<?> optionalField : baseData.optionalFields) {
Original file line number Diff line number Diff line change
@@ -83,17 +83,13 @@ final public class GencodeGtfFeatureBaseData {

/**
* Optional GENCODE GTF Fields.
* This also includes unidentified GTF fields that may not be specified in the Gencode GTF specification.
* For details, see the following:
* https://www.gencodegenes.org/data_format.html
* https://www.gencodegenes.org/gencode_tags.html
*/
public List<GencodeGtfFeature.OptionalField<?>> optionalFields = new ArrayList<>();

/**
* Additional optional GTF fields.
*/
public String anonymousOptionalFields = null;

public GencodeGtfFeatureBaseData() {}

public GencodeGtfFeatureBaseData(
@@ -117,8 +113,7 @@ public GencodeGtfFeatureBaseData(
final int exonNumber,
final String exonId,
final String locusLevel,
final List<GencodeGtfFeature.OptionalField<?>> optionalFields,
final String anonymousOptionalFields
final List<GencodeGtfFeature.OptionalField<?>> optionalFields
) {
this.gtfSourceFileType = gtfSourceFileType;
this.featureOrderNumber = featureOrderNumber;
@@ -145,7 +140,6 @@ public GencodeGtfFeatureBaseData(
this.optionalFields = optionalFields;
}

this.anonymousOptionalFields = anonymousOptionalFields;
}

@Override
@@ -183,8 +177,7 @@ else if ( this == that ) {
Objects.equals(exonNumber, thatBaseData.exonNumber) &&
Objects.equals(exonId, thatBaseData.exonId) &&
Objects.equals(locusLevel, thatBaseData.locusLevel) &&
Objects.equals(anonymousOptionalFields, thatBaseData.anonymousOptionalFields) &&
Objects.equals(optionalFields, thatBaseData.optionalFields);
Objects.equals(optionalFields, thatBaseData.optionalFields); //TODO should equality enforce the order of the optional fields?
}

return isEqual;
@@ -211,7 +204,6 @@ public int hashCode() {
result = 31 * result + (exonId != null ? exonId.hashCode() : 0);
result = 31 * result + (locusLevel != null ? locusLevel.hashCode() : 0);
result = 31 * result + (optionalFields != null ? optionalFields.hashCode() : 0);
result = 31 * result + (anonymousOptionalFields != null ? anonymousOptionalFields.hashCode() : 0);
return result;
}
}

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -280,8 +280,8 @@ private GencodeGtfFeatureBaseData createGtfBaseDataForTestIs5Prime(final SimpleI
1,
"",
GencodeGTFFieldConstants.LocusLevel.AUTOMATICALLY_ANNOTATED.toString(),
null,
"");
null
);
}

private GencodeGtfExonFeature helpCreateExonFeature(final SimpleInterval interval,
@@ -701,16 +701,15 @@ Object[][] provideForCreateDefaultFuncotationsOnProblemVariant() {

data = new GencodeGtfFeatureBaseData(GencodeGtfCodec.GTF_FILE_TYPE_STRING, 1, variantInterval.getContig(), GencodeGtfFeature.ANNOTATION_SOURCE_ENSEMBL, GencodeGtfFeature.FeatureType.GENE,
variantInterval.getStart()-2000, variantInterval.getEnd()+2000, Strand.POSITIVE, GencodeGtfFeature.GenomicPhase.DOT, "TEST_GENE1", null, GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString(),
null, "TEST_GENE", null, null, null, -1, null, GencodeGTFFieldConstants.LocusLevel.AUTOMATICALLY_ANNOTATED.toString(), null, null);
null, "TEST_GENE", null, null, null, -1, null, GencodeGTFFieldConstants.LocusLevel.AUTOMATICALLY_ANNOTATED.toString(), null);
final GencodeGtfGeneFeature gene = (GencodeGtfGeneFeature)GencodeGtfFeature.create(data);

// ======================

data = new GencodeGtfFeatureBaseData(GencodeGtfCodec.GTF_FILE_TYPE_STRING, 2, variantInterval.getContig(), GencodeGtfFeature.ANNOTATION_SOURCE_ENSEMBL, GencodeGtfFeature.FeatureType.TRANSCRIPT,
variantInterval.getStart()-1000, variantInterval.getEnd()+1000, Strand.POSITIVE, GencodeGtfFeature.GenomicPhase.DOT, "TEST_GENE1", "TEST_TRANSCRIPT1", GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString(),
null, "TEST_GENE", GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString(), null, "TEST_TRANSCRIPT1", -1, null, GencodeGTFFieldConstants.LocusLevel.AUTOMATICALLY_ANNOTATED.toString(),
Collections.emptyList(),
null
Collections.emptyList()
);
final GencodeGtfTranscriptFeature transcript1 = (GencodeGtfTranscriptFeature) GencodeGtfFeature.create(data);
gene.addTranscript(transcript1);
Original file line number Diff line number Diff line change
@@ -11,6 +11,8 @@
import java.io.FileInputStream;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
* Test class for the ENSEMBL GTF Reader.
@@ -30,8 +32,12 @@ private GencodeGtfFeature createEcoliEnsemblGene(final int startingFeatureOrder,
final String exonId, final int cdsStart, final int cdsEnd) {

// Placeholder for constant extra data:
final String geneAnonymousOptionalFields = " gene_source \"ena\";";
final String transcriptAnonymousOptionalFields = geneAnonymousOptionalFields + " transcript_source \"ena\";";
final List<GencodeGtfFeature.OptionalField<?>> geneAnonymousOptionalFields = Arrays.asList( new GencodeGtfFeature.OptionalField<>("gene_source", "ena"));
final List<GencodeGtfFeature.OptionalField<?>> transcriptAnonymousOptionalFields = Arrays.asList(
new GencodeGtfFeature.OptionalField<>("gene_source", "ena"),
new GencodeGtfFeature.OptionalField<>("transcript_source", "ena"));
//final String geneAnonymousOptionalFields = " gene_source \"ena\";";
//final String transcriptAnonymousOptionalFields = geneAnonymousOptionalFields + " transcript_source \"ena\";";

int featureOrderNum = startingFeatureOrder;

@@ -40,15 +46,13 @@ private GencodeGtfFeature createEcoliEnsemblGene(final int startingFeatureOrder,
data = new GencodeGtfFeatureBaseData(EnsemblGtfCodec.GTF_FILE_TYPE_STRING, featureOrderNum++, ECOLI_CONTIG_NAME, GencodeGtfFeature.ANNOTATION_SOURCE_ENA, GencodeGtfFeature.FeatureType.GENE,
geneStart, geneEnd, Strand.POSITIVE, GencodeGtfFeature.GenomicPhase.DOT, geneId, null, GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString(),
null, geneName, null, null, null, -1, null, null,
null,
geneAnonymousOptionalFields);
final GencodeGtfGeneFeature gene = (GencodeGtfGeneFeature)GencodeGtfFeature.create(data);
gene.setUcscGenomeVersion(ECOLI_UCSC_GENOME_VERSION);

data = new GencodeGtfFeatureBaseData(EnsemblGtfCodec.GTF_FILE_TYPE_STRING, featureOrderNum++, ECOLI_CONTIG_NAME, GencodeGtfFeature.ANNOTATION_SOURCE_ENA, GencodeGtfFeature.FeatureType.TRANSCRIPT,
geneStart, geneEnd, Strand.POSITIVE, GencodeGtfFeature.GenomicPhase.DOT, geneId, transcriptId, GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString(),
null, geneName, GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString(), null, transcriptName, -1, null, null,
null,
transcriptAnonymousOptionalFields
);
final GencodeGtfTranscriptFeature transcript = (GencodeGtfTranscriptFeature) GencodeGtfFeature.create(data);
@@ -57,7 +61,6 @@ private GencodeGtfFeature createEcoliEnsemblGene(final int startingFeatureOrder,
data = new GencodeGtfFeatureBaseData(EnsemblGtfCodec.GTF_FILE_TYPE_STRING, featureOrderNum++, ECOLI_CONTIG_NAME, GencodeGtfFeature.ANNOTATION_SOURCE_ENA, GencodeGtfFeature.FeatureType.EXON,
geneStart, geneEnd, Strand.POSITIVE, GencodeGtfFeature.GenomicPhase.DOT, geneId, transcriptId, GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString(),
null, geneName, GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString(), null, transcriptName, 1, exonId, null,
null,
transcriptAnonymousOptionalFields
);
final GencodeGtfExonFeature exon = (GencodeGtfExonFeature) GencodeGtfFeature.create(data);
@@ -66,16 +69,17 @@ private GencodeGtfFeature createEcoliEnsemblGene(final int startingFeatureOrder,
data = new GencodeGtfFeatureBaseData(EnsemblGtfCodec.GTF_FILE_TYPE_STRING, featureOrderNum++, ECOLI_CONTIG_NAME, GencodeGtfFeature.ANNOTATION_SOURCE_ENA, GencodeGtfFeature.FeatureType.CDS,
cdsStart, cdsEnd, Strand.POSITIVE, GencodeGtfFeature.GenomicPhase.ZERO, geneId, transcriptId, GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString(),
null, geneName, GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString(), null, transcriptName, 1, null, null,
Collections.singletonList(new GencodeGtfFeature.OptionalField<>("protein_id", transcriptId)),
transcriptAnonymousOptionalFields
Stream.concat(
transcriptAnonymousOptionalFields.stream(),
Stream.of(new GencodeGtfFeature.OptionalField<>("protein_id", transcriptId))
).collect(Collectors.toList())
);
final GencodeGtfCDSFeature cds = (GencodeGtfCDSFeature) GencodeGtfFeature.create(data);
cds.setUcscGenomeVersion(ECOLI_UCSC_GENOME_VERSION);

data = new GencodeGtfFeatureBaseData(EnsemblGtfCodec.GTF_FILE_TYPE_STRING, featureOrderNum++, ECOLI_CONTIG_NAME, GencodeGtfFeature.ANNOTATION_SOURCE_ENA, GencodeGtfFeature.FeatureType.START_CODON,
cdsStart, cdsStart+2, Strand.POSITIVE, GencodeGtfFeature.GenomicPhase.ZERO, geneId, transcriptId, GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString(),
null, geneName, GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString(), null, transcriptName, 1, null, null,
null,
transcriptAnonymousOptionalFields
);
final GencodeGtfStartCodonFeature startCodon = (GencodeGtfStartCodonFeature) GencodeGtfFeature.create(data);
@@ -84,7 +88,6 @@ private GencodeGtfFeature createEcoliEnsemblGene(final int startingFeatureOrder,
data = new GencodeGtfFeatureBaseData(EnsemblGtfCodec.GTF_FILE_TYPE_STRING, featureOrderNum, ECOLI_CONTIG_NAME, GencodeGtfFeature.ANNOTATION_SOURCE_ENA, GencodeGtfFeature.FeatureType.STOP_CODON,
cdsEnd+1, cdsEnd+3, Strand.POSITIVE, GencodeGtfFeature.GenomicPhase.ZERO, geneId, transcriptId, GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString(),
null, geneName, GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString(), null, transcriptName, 1, null, null,
null,
transcriptAnonymousOptionalFields
);
final GencodeGtfStopCodonFeature stopCodon = (GencodeGtfStopCodonFeature) GencodeGtfFeature.create(data);

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -299,8 +299,8 @@ public static GencodeGtfTranscriptFeature createArtificialGencodeGtfTranscriptFe
-1,
null,
GencodeGTFFieldConstants.LocusLevel.AUTOMATICALLY_ANNOTATED.toString(),
Collections.emptyList(),
null)
Collections.emptyList()
)
);
}