Skip to content

Commit

Permalink
rna_align_coverage.py
Browse files Browse the repository at this point in the history
  • Loading branch information
mmagnus committed Dec 13, 2024
1 parent 7a64810 commit 9f34167
Show file tree
Hide file tree
Showing 4 changed files with 1,035 additions and 1 deletion.
72 changes: 72 additions & 0 deletions rna_tools/tools/rna_alignment/rna_align_coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
"""
from Bio import AlignIO
import argparse
from icecream import ic
import sys
ic.configureOutput(outputFunction=lambda *a: print(*a, file=sys.stderr))
ic.configureOutput(prefix='> ')


def get_parser():
parser = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("-v", "--verbose",
action="store_true", help="be verbose")
parser.add_argument("sequence", help="", default="") # nargs='+')
parser.add_argument("stk", help="", default="") # nargs='+')
return parser


if __name__ == '__main__':
parser = get_parser()
args = parser.parse_args()

verbose = args.verbose

# Load the alignment
alignment = AlignIO.read(args.stk, "stockholm")
alignment_name = args.stk.split('/')[-1]
# Define the target sequence ID
target_id = args.sequence


# Find the target sequence
target_seq = None
for record in alignment:
if record.id == target_id:
target_seq = record.seq
break

if target_seq is None:
print(f"Target sequence '{target_id}' not found in the alignment.")
else:
seq_len = 0
cols_no_coverage = 0
# Collect columns where the target sequence has no gaps
filtered_columns = []
for col_index in range(alignment.get_alignment_length()):
if target_seq[col_index] != "-":
column = [record.seq[col_index] for record in alignment]
filtered_columns.append(column)
seq_len += 1

# Display filtered columns
if verbose: print(f"Columns where the target sequence '{target_id}' has no gaps:")
for col in filtered_columns:
if verbose: print(col, len(col))
# Count the number of '-'
gap_count = col.count('-')
if verbose: print(f"Number of '-': {gap_count}")
ratio_per_col = gap_count/(len(col) - 1)
if verbose: print(f'ratio of gaps: {ratio_per_col}') # -1 target_sequence
if ratio_per_col > 0.5:
if verbose: print('WARNING: more than 50% gaps in column', cols_no_coverage)
cols_no_coverage +=1
if verbose: print()

ratio = round((1 - (cols_no_coverage / seq_len)) * 100)
print(f"Query {target_id} length {seq_len} in the alignment {alignment_name} number of columns with low coverage (<50%) for the query sequence {cols_no_coverage}; coverge for the query: {ratio}%")
10 changes: 9 additions & 1 deletion rna_tools/tools/rna_alignment/test.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1 +1,9 @@
python rna_align_foldability.py test_data/gmp_ref.sto test_data/gmp_foldability.csv --verbose
#python rna_align_fetch_seed.py RF00026
#python rna_align_fetch_cm.py RF00026
#python rna_align_subcols.py test_data/u6_isl_rfam.sto
#python rna_align_foldability.py test_data/gmp_ref.sto test_data/gmp_foldability.csv --verbose

python rna_align_coverage.py xrRNA test_data/xrrna_rfam.sto
#easel alistat test_data/xrrna_rfam.sto
python rna_align_coverage.py u6isl test_data/u6_isl_rfam.sto

Loading

0 comments on commit 9f34167

Please sign in to comment.