Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue #7159 Create tool for producing genomic regions (as a BED file) #8942

Merged
merged 49 commits into from
Aug 29, 2024
Merged
Changes from 1 commit
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
4436f85
Initial commit and basic code to read gtf
sanashah007 Jul 10, 2024
f4a84be
add: code to write to bed & integration test
sanashah007 Jul 12, 2024
8139a96
fix: make getAllFeatures public and use the nesting of features to ge…
sanashah007 Jul 15, 2024
c35c829
add: filtering transcripts by basic tag
sanashah007 Jul 17, 2024
0366e35
add: sorts by contig and start (need to fix - sorting lexicographically)
sanashah007 Jul 18, 2024
d2323ff
fix: now sorts by contig then start & output is correct
sanashah007 Jul 22, 2024
9b6e27a
fix: make dictionary an arg
sanashah007 Jul 22, 2024
bd5a019
add: comments + simplified CompareGtfInfo
sanashah007 Jul 22, 2024
0ef5498
refactor: apply method
sanashah007 Jul 24, 2024
cdbe336
refactor: onTraversalSuccess and writeToBed
sanashah007 Jul 24, 2024
924216b
add: more tests
sanashah007 Jul 25, 2024
2d80fa0
fix: test files in correct dir pt1. (files are too large)
sanashah007 Jul 25, 2024
db3a24e
fix: test files in correct dir pt2.
sanashah007 Jul 25, 2024
7dc019d
add: compareFiles and ground truth bed files
sanashah007 Jul 30, 2024
4b35136
fix: runGtfToBed assert
sanashah007 Jul 30, 2024
799fee5
add: comments to GtfToBed
sanashah007 Jul 30, 2024
f8be0b2
fix: error handling for different versions of gtf and dictionary
sanashah007 Jul 31, 2024
3d2570a
fix: edited some bad conventions
sanashah007 Jul 31, 2024
1dec98b
fix: remove spaces from input file fullName
sanashah007 Aug 1, 2024
3d976f9
add: gtf file with MYT1L and MAPK1
sanashah007 Aug 1, 2024
22a99af
add: many transcripts unit test and refactoring
sanashah007 Aug 2, 2024
e1835c8
add: tiebreaker sorting by id
sanashah007 Aug 5, 2024
2aae3ae
add: make sort by basic optional
sanashah007 Aug 5, 2024
87e10bc
add: html doc comment
sanashah007 Aug 5, 2024
32bb1dd
fix: dictionary arg
sanashah007 Aug 5, 2024
fecbec2
fix: add "Gencode" to description
sanashah007 Aug 7, 2024
f0fc352
add: sample mouse gencode testing
sanashah007 Aug 7, 2024
161f040
fix: Remove arg shortnames
sanashah007 Aug 9, 2024
705cea9
fix: rename and move CompareGtfInfo
sanashah007 Aug 9, 2024
920ed22
fix: kebab-case args
sanashah007 Aug 9, 2024
33dd8ea
fix: update html doc
sanashah007 Aug 9, 2024
e7ea45f
fix: use IntegrationTestSpec.assertEqualTextFiles()
sanashah007 Aug 9, 2024
df07f5b
fix: remove unnecessary test of pik3ca
sanashah007 Aug 9, 2024
865ffd4
fix: remove set functions in GtfInfo
sanashah007 Aug 9, 2024
9080b18
fix: style of comparator
sanashah007 Aug 9, 2024
8a6dcf6
fix: style of comparator
sanashah007 Aug 9, 2024
829b8f3
fix: use Files.newOutputStream() to write and logger for errors
sanashah007 Aug 13, 2024
6c8f3dc
fix: use getBestAvailableSequenceDictionary()
sanashah007 Aug 13, 2024
c6dade5
fix: use dataProvider for integration tests
sanashah007 Aug 13, 2024
050dc49
fix: better encapsulation
sanashah007 Aug 13, 2024
ebaf095
fix: move mapk1.gtf to large dir
sanashah007 Aug 22, 2024
bc72121
fix: arg names
sanashah007 Aug 22, 2024
0ee3b76
fix: rename reference dict.
sanashah007 Aug 22, 2024
3eee529
fix: sequence-dictionary arg javadoc
sanashah007 Aug 22, 2024
7a7a7d0
add: javadoc to GtfInfo
sanashah007 Aug 22, 2024
4c573e0
add: dictionary exception and corresponding test
sanashah007 Aug 26, 2024
d30ed84
add: test with fasta file as reference arg
sanashah007 Aug 26, 2024
15e308c
add: javadoc for fasta file
sanashah007 Aug 27, 2024
167d5fd
fix: javadoc and onTraversalStart exception
sanashah007 Aug 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
refactor: apply method
test: add separate tests for gene and transcript
  • Loading branch information
sanashah007 committed Jul 24, 2024
commit 0ef54981849ce4d225fc03e74b609350c003a4cc
Original file line number Diff line number Diff line change
@@ -44,60 +44,82 @@ protected boolean isAcceptableFeatureType(Class<? extends Feature> featureType)
return featureType.isAssignableFrom(GencodeGtfFeature.class);
}

//Apply runs PER line
@Override
public void apply(GencodeGtfFeature feature, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) {
int geneStart = 0;
int geneEnd = 0;
List<GencodeGtfFeature> geneFeature = feature.getAllFeatures();

for (GencodeGtfFeature gtfFeature : geneFeature) {
List<GencodeGtfFeature.OptionalField<?>> optionalFields = null;
try {
optionalFields = gtfFeature.getOptionalField("tag");
} catch (Exception e) {
System.err.println("Could not retrieve optional fields" + e.getMessage());
//continue; //TODO: do i need continue? or return? or nothing?
}
try {
if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.GENE) { //if the feature type is gene
geneStart = gtfFeature.getStart(); //get gene start
geneEnd = gtfFeature.getEnd(); //get gene end
Interval interval = new Interval(gtfFeature.getContig(), geneStart, geneEnd); //create interval: contig (chromosome), start of gene, end of gene
GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.GENE, gtfFeature.getGeneName()); //create a new gtfInfo object: the interval, type as gene, and name of gene
idToInfo.put(gtfFeature.getGeneId(), gtfInfo); //create a new entry in the hashmap: key = geneId value = gtfInfo
}
for (GencodeGtfFeature.OptionalField<?> field : optionalFields) {
if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.TRANSCRIPT && field.getValue().equals("basic")) { //if the feature type is transcript and has the "basic" tag
Interval interval = new Interval(gtfFeature.getContig(), gtfFeature.getStart(), gtfFeature.getEnd()); //create interval: contig, start of transcript, end of transcript
GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.TRANSCRIPT, gtfFeature.getGeneName()); //create a new gtfInfo object: the interval, type as trnascript, and name of gene
idToInfo.put(gtfFeature.getTranscriptId(), gtfInfo); //create a new entry in hashmap: key = transcriptId value = gtfInfo
if (gtfFeature.getStart() < idToInfo.get(gtfFeature.getGeneId()).getStart()) { //if transcript start is less than it's corresponding gene's start
geneStart = gtfFeature.getStart(); //set the gene start to the transcript start
Interval geneInterval = new Interval(gtfFeature.getContig(), geneStart, geneEnd); //create new interval with this update
GtfInfo gtfGeneInfo = new GtfInfo(geneInterval, GtfInfo.Type.GENE, gtfFeature.getGeneName()); //create new gtfInfo with this update
idToInfo.put(gtfFeature.getGeneId(), gtfGeneInfo); //update the entry of the gene in hashmap to now have correct start value
}

if (gtfFeature.getEnd() > idToInfo.get(gtfFeature.getGeneId()).getEnd()) { //if transcript end is greater than it's corresponding gene's end
geneEnd = gtfFeature.getEnd(); //set the gene end to the transcript end
Interval geneInterval = new Interval(gtfFeature.getContig(), geneStart, geneEnd); //create new interval with this update
GtfInfo gtfGeneInfo = new GtfInfo(geneInterval, GtfInfo.Type.GENE, gtfFeature.getGeneName()); //create new gtfInfo with this update
idToInfo.put(gtfFeature.getGeneId(), gtfGeneInfo); //update the entry of the gene in hashmap to now have correct end value
}
}
}
List<GencodeGtfFeature> geneFeatures = feature.getAllFeatures();

for (GencodeGtfFeature gtfFeature : geneFeatures) {
processFeature(gtfFeature);
sanashah007 marked this conversation as resolved.
Show resolved Hide resolved
}
}

} catch (Exception e) {
System.err.println("Error processing feature: " + e.getMessage());
private void processFeature(GencodeGtfFeature gtfFeature) {
List<GencodeGtfFeature.OptionalField<?>> optionalFields = getOptionalFields(gtfFeature);

if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.GENE) {
processGeneFeature(gtfFeature);
}

for (GencodeGtfFeature.OptionalField<?> field : optionalFields) {
if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.TRANSCRIPT && "basic".equals(field.getValue())) {
processTranscriptFeature(gtfFeature);
}
}
}

private List<GencodeGtfFeature.OptionalField<?>> getOptionalFields(GencodeGtfFeature gtfFeature) {
List<GencodeGtfFeature.OptionalField<?>> optionalFields = null;
try {
optionalFields = gtfFeature.getOptionalField("tag");
} catch (Exception e) {
System.err.println("Could not retrieve optional fields: " + e.getMessage());
sanashah007 marked this conversation as resolved.
Show resolved Hide resolved
}
return optionalFields;
}

private void processGeneFeature(GencodeGtfFeature gtfFeature) {
int geneStart = gtfFeature.getStart();
int geneEnd = gtfFeature.getEnd();
Interval interval = new Interval(gtfFeature.getContig(), geneStart, geneEnd);
GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.GENE, gtfFeature.getGeneName());
idToInfo.put(gtfFeature.getGeneId(), gtfInfo);
}

private void processTranscriptFeature(GencodeGtfFeature gtfFeature) {
Interval interval = new Interval(gtfFeature.getContig(), gtfFeature.getStart(), gtfFeature.getEnd());
GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.TRANSCRIPT, gtfFeature.getGeneName());
idToInfo.put(gtfFeature.getTranscriptId(), gtfInfo);
updateGeneStartIfNeeded(gtfFeature);
updateGeneEndIfNeeded(gtfFeature);
}

private void updateGeneStartIfNeeded(GencodeGtfFeature gtfFeature) {
int geneStart = idToInfo.get(gtfFeature.getGeneId()).getStart();
if (gtfFeature.getStart() < geneStart) {
geneStart = gtfFeature.getStart();
updateGeneInterval(gtfFeature, geneStart, idToInfo.get(gtfFeature.getGeneId()).getEnd());
}
}

private void updateGeneEndIfNeeded(GencodeGtfFeature gtfFeature) {
int geneEnd = idToInfo.get(gtfFeature.getGeneId()).getEnd();
if (gtfFeature.getEnd() > geneEnd) {
geneEnd = gtfFeature.getEnd();
updateGeneInterval(gtfFeature, idToInfo.get(gtfFeature.getGeneId()).getStart(), geneEnd);
}
}

private void updateGeneInterval(GencodeGtfFeature gtfFeature, int geneStart, int geneEnd) {
Interval geneInterval = new Interval(gtfFeature.getContig(), geneStart, geneEnd);
GtfInfo gtfGeneInfo = new GtfInfo(geneInterval, GtfInfo.Type.GENE, gtfFeature.getGeneName());
idToInfo.put(gtfFeature.getGeneId(), gtfGeneInfo);
}



@Override
public Object onTraversalSuccess() { //after going through every line of the gtf
SamReader reader = SamReaderFactory.makeDefault().open(new File(String.valueOf(dictionaryPath)));
SamReader reader = SamReaderFactory.makeDefault().open(new File(String.valueOf(dictionaryPath))); //TODO: what is makeDefault() doing
SAMSequenceDictionary sequenceDictionary = reader.getFileHeader().getSequenceDictionary(); //get the sequence dictionary

List<Map.Entry<String, GtfInfo>> entries = new ArrayList<>(idToInfo.entrySet()); //convert the hashmap (populate in apply()) to a list
@@ -116,7 +138,7 @@ public Object onTraversalSuccess() { //after going through every line of the gtf
}


public void writeToBed(GtfInfo.Type type, Map<String, GtfInfo> sortedMap) {
private void writeToBed(GtfInfo.Type type, Map<String, GtfInfo> sortedMap) {
try (BufferedWriter writer = new BufferedWriter(new FileWriter(String.valueOf(outputFile)))) { //write to bed file
for (Map.Entry<String, GtfInfo> entry : sortedMap.entrySet()) { //for each entry in the sorted map
if (entry.getValue().getType() == type) {
@@ -136,6 +158,7 @@ public void writeToBed(GtfInfo.Type type, Map<String, GtfInfo> sortedMap) {
}
}


@Override
public GATKPath getDrivingFeaturePath() {
return inputFile;
Original file line number Diff line number Diff line change
@@ -2,21 +2,55 @@

import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.File;

public class GtfToBedIntegrationTest extends CommandLineProgramTest {
private static final File input = new File("/Users/shahsana/TestGatk/gencode.v38.chr_patch_hapl_scaff.annotation.gtf");
private static final File dictionary = new File("/Users/shahsana/TestGatk/reference.dict");

@Test
public void testGene() {
runGtfToBed(false);
}

@Test
public void testTranscript() {
runGtfToBed(true);
}

@Test
public void testOneGene(){

}

@Test
public void testOneTranscript(){

}

@Test
public void testDecoyGenes(){

}

@Test
public void testDoesNotCrash() {
public void testDecoyTranscripts(){

}


private void runGtfToBed(boolean transcript){
File outputFile = new File("/Users/shahsana/TestGatk/output.bed");
final ArgumentsBuilder argsBuilder = new ArgumentsBuilder()
.add("G", input)
.add("D", dictionary)
.add(GtfToBed.SORT_BY_TRANSCRIPT_LONG_NAME, true)
.add(GtfToBed.SORT_BY_TRANSCRIPT_LONG_NAME, transcript)
.addOutput(outputFile);
runCommandLine(argsBuilder);
Assert.assertEquals(1,1);
}

}