handle edge case where no putative novel ALEs found #51

Open · wants to merge 3 commits into base: main
18 changes: 12 additions & 6 deletions scripts/filter_tx_by_three_end.py
@@ -720,6 +720,7 @@ def main(gtf_path,
# eprint(le_combined_pass.columns)

pass_ids = set(le_combined_pass.as_df()[le_id_col])
eprint(f"Number of events passing database/PAS motif filters - {len(pass_ids)}")

#6. Generate 'match_stats' dataframe plus summary counts dfs
# Subset to df of Tx_id, atlas_filter, motif_filter, atlas_distance & motifs_found
@@ -736,11 +737,16 @@ def main(gtf_path,
"event_type"
]

fail_match_stats = (le.subset(lambda df: ~df[le_id_col].isin(pass_ids))
.as_df()
[ms_cols]
.drop_duplicates(subset=["transcript_id"])
)
fail_match_stats = le.subset(lambda df: ~df[le_id_col].isin(pass_ids))
# Subset to minimal output columns
if len(fail_match_stats) > 0:
fail_match_stats = (fail_match_stats
.as_df()
[ms_cols]
.drop_duplicates(subset=["transcript_id"])
)
else:
fail_match_stats = pd.DataFrame(columns=ms_cols)

pass_match_stats = le_combined_pass.as_df()[ms_cols]

@@ -906,7 +912,7 @@ def main(gtf_path,
pas_motifs = gruber_pas_motifs

elif args.motifs.capitalize() == "Beaudoing":
pas_motifs = Beaudoing_pas_motifs
pas_motifs = beaudoing_pas_motifs

elif os.path.exists(args.motifs):
eprint(f"reading in pas motifs from file - {args.motifs}")
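The change to `filter_tx_by_three_end.py` guards against the case where every event passes the database/PAS motif filters: `le.subset(...)` then returns an empty PyRanges, whose `.as_df()` carries none of the metadata columns, so selecting `ms_cols` would raise a KeyError. Below is a minimal sketch of the same guard; the helper name, signature and the `le_id_col` default are illustrative and not part of the PR.

```python
import pandas as pd
import pyranges as pr

def failing_match_stats(le: pr.PyRanges, pass_ids: set, ms_cols: list,
                        le_id_col: str = "le_id") -> pd.DataFrame:
    """Illustrative helper: collect match stats for events that failed the
    database/PAS motif filters, falling back to an empty frame with the
    expected columns when every event passed (an empty PyRanges converted
    with as_df() has no metadata columns)."""
    fail = le.subset(lambda df: ~df[le_id_col].isin(pass_ids))

    if len(fail) == 0:
        # nothing failed - return an empty frame so downstream merges/writes still work
        return pd.DataFrame(columns=ms_cols)

    return (fail.as_df()
                [ms_cols]
                .drop_duplicates(subset=["transcript_id"]))
```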
104 changes: 62 additions & 42 deletions scripts/get_novel_last_exons.py
@@ -449,9 +449,21 @@ def find_extension_events(novel_le,
suffix
)
)
post_tol_ids = set(novel_le_ext.as_df()[id_col])

try:
post_tol_ids = set(novel_le_ext.as_df()[id_col])
except KeyError:
# empty df/no extensions found
post_tol_ids = set()

eprint(f"After 5'end match tolerance filter, number of events - {len(post_tol_ids)}")
if len(post_tol_ids) == 0:
# no extensions found, return empty pyranges
if return_filtered_ids:
# return tuple of empty gr & empty sets
return pr.PyRanges(), set(), set()
else:
return pr.PyRanges()

if return_filtered_ids:
end_5p_filt_ids = post_tol_ids - post_len_ids
@@ -523,6 +535,10 @@ def find_spliced_events(novel_li,
# Only novel last SJs overlapping ref SJs kept
novel_spliced = novel_li.join(ref_introns, strandedness="same", suffix=suffix)

if len(novel_spliced) == 0:
# no putative events found, return empty ranges
return pr.PyRanges()

eprint(f"Number of putative novel spliced events - {_n_ids(novel_spliced, id_col)}")
# eprint(f"ref exon ranks\n {novel_spliced.as_df()[rank_col].drop_duplicates()}")

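Both hunks above handle the same failure mode: a join that matches nothing yields an empty PyRanges without the joined metadata columns, so pulling the ID column out of `.as_df()` raises a KeyError. `find_extension_events` catches that and returns early; `find_spliced_events` checks the length directly. A minimal sketch of the pattern follows; the join call and variable names in the usage comment are illustrative, since the surrounding code is not shown in this diff.

```python
import pyranges as pr

def ids_after_join(joined: pr.PyRanges, id_col: str) -> set:
    """Illustrative: extract event IDs from a join result, tolerating the
    empty case where the metadata column is absent from the converted df."""
    try:
        return set(joined.as_df()[id_col])
    except KeyError:
        # empty join result - no putative events found
        return set()

# Usage mirroring find_extension_events (names assumed for illustration):
# novel_le_ext = novel_le.join(ref_last_exons, strandedness="same", suffix="_ref")
# post_tol_ids = ids_after_join(novel_le_ext, id_col)
# if len(post_tol_ids) == 0:
#     return (pr.PyRanges(), set(), set()) if return_filtered_ids else pr.PyRanges()
```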
@@ -777,47 +793,51 @@ def main(input_gtf_path,

eprint(f"Complete - took {end - start} s")

# Spliced is last introns, want corresponding last exons in output
spliced_le = novel_le.subset(lambda df: df["transcript_id"].isin(set(spliced.transcript_id)))

# Want to retain metadata from matching reference regions
# These coordinates/metadata correspond to matching overlapping ref last intron/SJ
spliced = spliced[["transcript_id",
"gene_id_ref",
"transcript_id_ref",
"gene_name", # comes from ref GTF, but not always gene_name in StringTie output ('ref_gene_name') will prefer matching from ref
"Start_ref",
"End_ref",
"event_type"]]

# Add the _ref suffix so it's obvious the col came from the reference GTF
spliced = spliced.apply(lambda df: df.rename(columns={"gene_name": "gene_name_ref"}))

spliced_cols = spliced.columns.tolist()

# eprint(spliced_le.columns)

spliced_le = spliced_le.apply_pair(spliced,
lambda df, df2:_pd_merge_gr(df,
df2.drop(columns=["Chromosome",
"Start",
"End",
"Strand"
]
),
how="left",
on="transcript_id",
suffixes=[None, "_spl"],
to_merge_cols=spliced_cols),
)
# eprint(spliced_le.columns)

# Since drop default cols from spliced, should have no cols with suffix
# spliced_le = spliced_le.drop(like="_spl$")

# eprint(spliced_le.columns)

combined = pr.concat([extensions, spliced_le])
if len(spliced) != 0:
# Spliced is last introns, want corresponding last exons in output
spliced_le = novel_le.subset(lambda df: df["transcript_id"].isin(set(spliced.transcript_id)))

# Want to retain metadata from matching reference regions
# These coordinates/metadata correspond to matching overlapping ref last intron/SJ
spliced = spliced[["transcript_id",
"gene_id_ref",
"transcript_id_ref",
"gene_name", # comes from ref GTF, but not always gene_name in StringTie output ('ref_gene_name') will prefer matching from ref
"Start_ref",
"End_ref",
"event_type"]]

# Add the _ref suffix so it's obvious the col came from the reference GTF
spliced = spliced.apply(lambda df: df.rename(columns={"gene_name": "gene_name_ref"}))

spliced_cols = spliced.columns.tolist()

# eprint(spliced_le.columns)

spliced_le = spliced_le.apply_pair(spliced,
lambda df, df2:_pd_merge_gr(df,
df2.drop(columns=["Chromosome",
"Start",
"End",
"Strand"
]
),
how="left",
on="transcript_id",
suffixes=[None, "_spl"],
to_merge_cols=spliced_cols),
)
# eprint(spliced_le.columns)

# Since drop default cols from spliced, should have no cols with suffix
# spliced_le = spliced_le.drop(like="_spl$")

# eprint(spliced_le.columns)

combined = pr.concat([extensions, spliced_le])

else:
combined = extensions

# Finally, collapse metadata/duplicate attribute values for each last exon (transcript ID)
# This can occur if same last exon matches to multiple reference transcripts/junctions
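The last hunk wraps the whole spliced-event merge in `main()` behind a length check, so that when no spliced events were found the combined output is just the extensions and an empty, column-less PyRanges never reaches `pr.concat`. A condensed, illustrative version of that guard (the function wrapper is not part of the PR):

```python
import pyranges as pr

def combine_novel_events(extensions: pr.PyRanges,
                         spliced_le: pr.PyRanges,
                         spliced: pr.PyRanges) -> pr.PyRanges:
    """Illustrative condensation of the guard added to main(): only merge the
    spliced last exons into the combined set when spliced events exist."""
    if len(spliced) == 0:
        # no putative novel spliced ALEs - output is extensions only
        return extensions
    return pr.concat([extensions, spliced_le])
```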