Merge branch 'release_v0.3.9'

foerstner-lab · Mar 7, 2016 · 94d1ad9 · 94d1ad9
2 parents 479b397 + 3704017
commit 94d1ad9
Show file tree

Hide file tree

Showing 7 changed files with 83 additions and 35 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,6 @@
 *pyc
 rapl/doc/build/
+
+# Links for development only
+bin/reademptionlib
+
diff --git a/bin/reademption b/bin/reademption
@@ -137,13 +137,19 @@ def main():
         "base.")
     coverage_creation_parser.add_argument(
         "--coverage_style", "-b", choices=["global", "first_base_only",
-                                           "last_base_only"],
-        default="global", help="Choose for coverage "
-        "generation if only the first aligned base at the 5' end of each read "
-        "('first_base_only') or the last aligned base at the 3' end of each "
-        "read ('last_base_only') is taken into account. By default the "
-        "coverage is generated using the whole range of each alignment "
+                                           "last_base_only", "centered"],
+        default="global", help="Select for coverage generation if only the "
+        "first aligned base at the 5' end of each read ('first_base_only') or "
+        "the last aligned base at the 3' end of each read ('last_base_only') "
+        "is taken into account. The centered approach ('centered') clips a "
+        "predefined number of nts from each alignment end and adds to the "
+        "remaining genomic region a value divided by its length. By default "
+        "the coverage is generated using the whole range of each alignment "
         "('global').")
+    coverage_creation_parser.add_argument(
+        "--clip_length", "-cl", type=int, default=11, help="Number of "
+        "nucleotides that are clipped from each alignment end for centered "
+        "approach.")
     coverage_creation_parser.add_argument(
         "--check_for_existing_files", "-f", default=False,
         action="store_true", help="Check for existing files (e.g. from a "
@@ -164,13 +170,19 @@ def main():
         help="Minimal read-annotation-overlap (in nt) (default 1).")
     gene_wise_quanti_parser.add_argument(
         "--read_region", "-b", choices=["global", "first_base_only",
-                                           "last_base_only"],
-        default="global", help="Choose for gene-wise quantification "
+                                        "last_base_only", "centered"],
+        default="global", help="Select for gene-wise quantification "
         "if only the first aligned base at the 5' end of each read "
         "('first_base_only') or the last aligned base at the 3' end of each "
-        "read ('last_base_only') is taken into account. By default the "
-        "overlap is calculated based on the whole range of each alignment "
+        "read ('last_base_only') is taken into account. The centered approach "
+        "('centered') clips a predefined number of nts from each alignment end "
+        "and calculates the overlap based on the remaining region. By default "
+        "the overlap is calculated based on the whole range of each alignment "
         "('global').")
+    gene_wise_quanti_parser.add_argument(
+        "--clip_length", "-cl", type=int, default=11, help="Number of "
+        "nucleotides that are clipped from each alignment end for centered "
+        "approach.")
     gene_wise_quanti_parser.add_argument(
         "--no_count_split_by_alignment_no", "-n", default=False,
         action="store_true", help="Do not split read countings by the number "

diff --git a/reademptionlib/controller.py b/reademptionlib/controller.py
@@ -535,8 +535,9 @@ def _create_coverage_files_for_lib(
             read_count_splitting = False
         coverage_calculator = CoverageCalculator(
             read_count_splitting=read_count_splitting,
-            uniqueley_aligned_only=self._args.unique_only,
+            uniquely_aligned_only=self._args.unique_only,
             coverage_style=self._args.coverage_style,
+            clip_length=self._args.clip_length,
             non_strand_specific=self._args.non_strand_specific)
         (coverage_writers_raw, coverage_writers_tnoar_min_norm,
          coverage_writers_tnoar_mil_norm) = self._wiggle_writers(
@@ -649,6 +650,7 @@ def _quantify_gene_wise(
         gene_wise_quantification = GeneWiseQuantification(
             min_overlap=self._args.min_overlap,
             read_region=self._args.read_region,
+            clip_length=self._args.clip_length,
             norm_by_alignment_freq=norm_by_alignment_freq,
             norm_by_overlap_freq=norm_by_overlap_freq,
             allowed_features_str=self._args.allowed_features,

diff --git a/reademptionlib/coveragecalculator.py b/reademptionlib/coveragecalculator.py
@@ -1,13 +1,16 @@
+import numpy as np
 import pysam
 
 
 class CoverageCalculator(object):
 
-    def __init__(self, read_count_splitting=True, uniqueley_aligned_only=False,
-                 coverage_style="global", non_strand_specific=False):
+    def __init__(self, read_count_splitting=True, uniquely_aligned_only=False,
+                 coverage_style="global", clip_length=11,
+                 non_strand_specific=False):
         self._read_count_splitting = read_count_splitting
-        self._uniqueley_aligned_only = uniqueley_aligned_only
+        self._uniquely_aligned_only = uniquely_aligned_only
         self._coverage_style = coverage_style
+        self._clip_length = clip_length
         self._coverage_add_function = self._select_coverage_add_function()
         self._coverages = {}
         self._non_strand_specific = non_strand_specific
@@ -30,12 +33,12 @@ def _sum_strand_coverages(self):
 
     def _init_coverage_list(self, length):
         for strand in ["forward", "reverse"]:
-            self._coverages[strand] = [0.0] * length
+            self._coverages[strand] = np.array([0.0] * length)
 
     def _calc_coverage(self, ref_seq, bam):
         for entry in bam.fetch(ref_seq):
             number_of_hits = dict(entry.tags)["NH"]
-            if self._uniqueley_aligned_only is True and number_of_hits != 1:
+            if self._uniquely_aligned_only is True and number_of_hits != 1:
                 continue
             # Note: No translation from SAMParsers coordinates to python
             # list coorindates is needed.
@@ -54,6 +57,8 @@ def _select_coverage_add_function(self):
             return self._add_first_base_coverage
         elif self._coverage_style == "last_base_only":
             return self._add_last_base_coverage
+        elif self._coverage_style == "centered":
+            return self._add_centered_coverage
         else:
             return self._add_whole_alignment_coverage
 
@@ -63,28 +68,35 @@ def _open_bam_file(self, bam_file):
     def _add_whole_alignment_coverage(self, entry, increment, start, end):
         if ((entry.is_reverse is False and entry.is_read2 is False) or
                 (entry.is_reverse is True and entry.is_read2 is True)):
-            self._coverages["forward"][start:end] = [
-                coverage + increment for coverage in
-                self._coverages["forward"][start:end]]
+            self._coverages["forward"][start:end] += increment
         else:
-            self._coverages["reverse"][start:end] = [
-                coverage - increment for coverage in
-                self._coverages["reverse"][start:end]]
+            self._coverages["reverse"][start:end] -= increment
 
     def _add_first_base_coverage(self, entry, increment, start, end):
         if ((entry.is_reverse is False and entry.is_read2 is False) or
                 (entry.is_reverse is True and entry.is_read2 is True)):
-            self._coverages["forward"][start] = self._coverages[
-                "forward"][start] + increment
+            self._coverages["forward"][start] += increment
         else:
-            self._coverages["reverse"][end-1] = self._coverages[
-                "reverse"][end-1] - increment
+            self._coverages["reverse"][end-1] -= increment
 
     def _add_last_base_coverage(self, entry, increment, start, end):
         if ((entry.is_reverse is False and entry.is_read2 is False) or
                 (entry.is_reverse is True and entry.is_read2 is True)):
-            self._coverages["forward"][end-1] = self._coverages[
-                "forward"][end-1] + increment
+            self._coverages["forward"][end-1] += increment
         else:
-            self._coverages["reverse"][start] = self._coverages[
-                "reverse"][start] - increment
+            self._coverages["reverse"][start] -= increment
+
+    def _add_centered_coverage(self, entry, increment, start, end):
+        center_start = start + self._clip_length
+        center_end = end - self._clip_length
+        center_length = float(center_end - center_start)
+        if center_length < 1.0:
+            #print(entry)
+            return
+        if ((entry.is_reverse is False and entry.is_read2 is False) or
+                (entry.is_reverse is True and entry.is_read2 is True)):
+            self._coverages["forward"][center_start:center_end] += (
+                increment / center_length)
+        else:
+            self._coverages["reverse"][center_start:center_end] -= (
+                increment / center_length)
diff --git a/reademptionlib/genewisequanti.py b/reademptionlib/genewisequanti.py
@@ -5,7 +5,7 @@
 
 class GeneWiseQuantification(object):
 
-    def __init__(self, min_overlap=1, read_region="global",
+    def __init__(self, min_overlap=1, read_region="global", clip_length=11,
                  norm_by_alignment_freq=True, norm_by_overlap_freq=True,
                  allowed_features_str=None, skip_antisense=False,
                  unique_only=False):
@@ -18,6 +18,7 @@ def __init__(self, min_overlap=1, read_region="global",
         """
         self._min_overlap = min_overlap
         self._read_region = read_region
+        self._clip_length = clip_length
         self._norm_by_alignment_freq = norm_by_alignment_freq
         self._norm_by_overlap_freq = norm_by_overlap_freq
         self._allowed_features = _allowed_features(allowed_features_str)
@@ -144,6 +145,12 @@ def _overlapping_alignments(self, sam, entry):
                 if (alignment.is_reverse is True) and (
                    (start < entry.start) or (start > entry.end)):
                         continue
+            elif self._read_region == "centered":
+                if _get_overlap(start + self._clip_length,
+                                end - self._clip_length,
+                                entry.start,
+                                entry.end) < self._min_overlap:
+                    continue
             else:
                 if alignment.get_overlap(entry.start-1,
                                          entry.end) < self._min_overlap:
@@ -321,7 +328,12 @@ def _allowed_features(allowed_features_str):
         return [
             feature.strip() for feature in allowed_features_str.split(",")]
 
-   
+
 def _gff_field_descriptions():
     return ["Sequence name", "Source", "Feature", "Start", "End", "Score",
             "Strand", "Frame", "Attributes"]
+
+def _get_overlap(alignment_start, alignment_end, feature_start, feature_end):
+    return max(0, 
+               min(alignment_end, feature_end) - 
+               max(alignment_start, feature_start) + 1)
diff --git a/reademptionlib/vizdeseq.py b/reademptionlib/vizdeseq.py
@@ -31,7 +31,7 @@ def create_scatter_plots(self, plot_path):
         self._pp_scatterplots.close()
 
     def _create_scatter_plots(self, condition_1, condition_2):
-        pd.options.display.mpl_style = 'default'
+        matplotlib.style.use('ggplot')
         font = {'family': 'sans-serif', 'size': 7}
         matplotlib.rc('font', **font)
         deseq_result = pd.read_table(
@@ -41,9 +41,14 @@ def _create_scatter_plots(self, condition_1, condition_2):
                  deseq_result.log2FoldChange, ".", alpha=0.3)
         significant_deseq_result = deseq_result[
             deseq_result.padj < self._p_value_significance_limit]
+        non_significant_deseq_result = deseq_result[
+            deseq_result.padj >= self._p_value_significance_limit]
         plt.plot(
             np.log10(significant_deseq_result.baseMean),
-            significant_deseq_result.log2FoldChange, ".r", alpha=0.5)
+            significant_deseq_result.log2FoldChange, ".r", alpha=0.3)
+        plt.plot(
+            np.log10(non_significant_deseq_result.baseMean),
+            non_significant_deseq_result.log2FoldChange, ".k", alpha=0.5)
         plt.title("{} vs. {} - MA plot".format(condition_1, condition_2))
         y_max = max([abs(log2_fc)
                      for log2_fc in deseq_result.log2FoldChange])

diff --git a/reademptionlib/vizgenequanti.py b/reademptionlib/vizgenequanti.py
@@ -99,7 +99,8 @@ def plot_annotation_class_quantification(self, plot_path):
         matplotlib.rc('font', **font)
         plt.title("Number of reads per RNA classes")
         color_index = 0
-        for direction in ["sense", "anti-sense"]:
+        for direction in self._lib_names_and_class_quanti[
+                self._lib_names[0]].keys():
             for anno_class in all_classes_sorted:
                 countings = [
                     self._lib_names_and_class_quanti[lib][direction][