diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 0f542131..247788f0 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,3 +1,9 @@ +v0.3.1 (2014-07-05) +- Add live / installation image (thanks anonymous reviewer #1) +- Add support for Fastq read files (thanks anonymous reviewer #3) +- Extend CLI documentation (thanks reviewer Matt MacManes) +- Fix/extend online documentation (thanks for help re. OS X to Lei Li + and the feedback to Petya Zhelyazkova) v0.3.0 (2014-05-18) - Improve Makefile - Update docs diff --git a/Makefile b/Makefile index facd3b6d..0c93d47a 100644 --- a/Makefile +++ b/Makefile @@ -54,8 +54,8 @@ new_release: @echo "* Test doc creation" @echo "* make package_to_pypi" @echo "* git add CHANGELOG.txt bin/reademption docs/source/conf.py setup.py" - @echo "* Commit changes e.g. 'git commit -m \"Set version to 0.2.X\"'" - @echo "* Tag the commit e.g. 'git tag -a v0.2.X -m \"version v0.2.X\"'" + @echo "* Commit changes e.g. 'git commit -m \"Set version to 0.3.X\"'" + @echo "* Tag the commit e.g. 'git tag -a v0.3.X -m \"version v0.3.X\"'" @echo "* Merge release into dev and master" @echo "* Push it to github: git push" @echo "* Generate a new release based on this tag at" diff --git a/bin/reademption b/bin/reademption index 85f86312..be13d8a0 100755 --- a/bin/reademption +++ b/bin/reademption @@ -9,7 +9,7 @@ __author__ = "Konrad Foerstner " __copyright__ = "2011-2014 by Konrad Foerstner " __license__ = "ISC license" __email__ = "konrad@foerstner.org" -__version__ = "0.3.0" +__version__ = "0.3.1" def main(): parser = argparse.ArgumentParser() @@ -34,10 +34,11 @@ def main(): "directory is used.") read_aligning_parser.add_argument( "--min_read_length", "-l", default=12, type=int, - help="Minimal read length after clipping.") + help="Minimal read length after clipping (default 12). Should be " + "higher for eukaryotic species.") read_aligning_parser.add_argument( "--processes", "-p", default=1, type=int, - help="Number of processes that should be used.") + help="Number of processes that should be used (default 1).") read_aligning_parser.add_argument( "--segemehl_accuracy", "-a", default=95.0, type=float, help="Segemehl's minimal accuracy (in %%) (default 95).") @@ -71,6 +72,9 @@ def main(): read_aligning_parser.add_argument( "--lack_bin", "-L", default="lack.x", help="Lack's binary path (default 'lack.x').") + read_aligning_parser.add_argument( + "--fastq", "-q", default=False, action="store_true", + help="Input reads are in FASTQ not FASTA format.") read_aligning_parser.add_argument( "--check_for_existing_files", "-f", default=False, action="store_true", help="Check for existing files (e.g. from a " @@ -108,7 +112,7 @@ def main(): "calculation.") coverage_creation_parser.add_argument( "--processes", "-p", default=1, type=int, - help="Number of processes that should be used.") + help="Number of processes that should be used (default 1).") coverage_creation_parser.add_argument( "--skip_read_count_splitting", "-s", default=False, action="store_true", help="Do not split the read counting between " @@ -151,7 +155,7 @@ def main(): "and anti-sense overlaps are counted and separately reported.") gene_wise_quanti_parser.add_argument( "--processes", "-p", default=1, type=int, - help="Number of processes that should be used.") + help="Number of processes that should be used (default 1).") gene_wise_quanti_parser.add_argument( "--features", "-t", dest="allowed_features", default=None, help="Comma separated list of features that should be considered " diff --git a/docs/source/conf.py b/docs/source/conf.py index d6ad2749..c7efcacc 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -50,7 +50,7 @@ # The short X.Y version. version = '0.3' # The full version, including alpha/beta/rc tags. -release = '0.3.0' +release = '0.3.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/source/example_analysis.rst b/docs/source/example_analysis.rst index 9f7ce17c..1bc12978 100644 --- a/docs/source/example_analysis.rst +++ b/docs/source/example_analysis.rst @@ -103,7 +103,10 @@ Finally, we need the reads of the RNA-Seq libraries. To save some time for running this examples we will work with subsampled libraries of 1M reads each. This will the limit informative value of the results which is acceptable as we just want to understand the workflow of the -READemption. +READemption. Please be aware that READemption does not perform quality +trimming or adapter clipping so far. For this purpose use the `FASTX +toolkit `_, `cutadapt +`_ or other tools. :: diff --git a/docs/source/index.rst b/docs/source/index.rst index e47b414c..f88ec6ea 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -8,6 +8,7 @@ Table of content index installation + live_and_installation_image subcommands example_analysis troubleshooting diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 91ec4a61..d4920b41 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -1,13 +1,14 @@ -Installation -============ +Installation and updating +========================= Requirements ------------ -READemption was developed using Python 3.3 and the user is advised to -run READemption with this or a higher version. In any case, the third -party packages `pysam `_ as well as -`setuptools `_ and `pip +READemption was started to be developed using Python 3.2 and the user +is advised to run READemption with this or a higher version. In any +case, the third party packages `pysam +`_ as well as `setuptools +`_ and `pip `_ should be available on the system in order to make the installation easy. READemption uses the short read mapper `segemehl @@ -21,8 +22,8 @@ necessary for the subcommand ``deseq`` which performs differential gene expression analysis. Don't worry - in the following the installation of all these requirements will be covered. -Installing on a fresh Ubuntu installation ------------------------------------------ +Installing on a fresh Ubuntu system +----------------------------------- The following installation procedure was tested on a `Amazon AWS t1.micro @@ -53,8 +54,8 @@ Some comments: :: - curl http://www.bioinf.uni-leipzig.de/Software/segemehl/segemehl_0_1_7.tar.gz > segemehl_0_1_7.tar.gz - tar xzf segemehl_0_1_7.tar.gz + curl http://www.bioinf.uni-leipzig.de/Software/segemehl/segemehl_0_1_9.tar.gz > segemehl_0_1_9.tar.gz + tar xzf segemehl_0_1_9.tar.gz cd segemehl_*/segemehl/ && make && cd ../../ Copying the executable to a location that is part of the ``PATH`` e.g @@ -62,13 +63,13 @@ Copying the executable to a location that is part of the ``PATH`` e.g :: - sudo cp segemehl_0_1_7/segemehl/segemehl.x /usr/bin/segemehl.x - sudo cp segemehl_0_1_7/segemehl/lack.x /usr/bin/lack.x + sudo cp segemehl_0_1_9/segemehl/segemehl.x /usr/bin/segemehl.x + sudo cp segemehl_0_1_9/segemehl/lack.x /usr/bin/lack.x ... or the bin folder of your home directory:: mkdir ~/bin - cp segemehl_0_1_7/segemehl/segemehl.x ~/bin + cp segemehl_0_1_9/segemehl/segemehl.x ~/bin 3. Install DESeq2 ~~~~~~~~~~~~~~~~~ @@ -82,7 +83,7 @@ and install the DESeq2 package inside of the interactive command line interface. You might be asked to confirm the installation path:: source("http://bioconductor.org/biocLite.R") - biocLite("DESeq2"). + biocLite("DESeq2") Leave ``R``:: @@ -101,12 +102,49 @@ Voilà! You should now be able to call READemption:: reademption -h -.. -.. Global installation -.. ------------------- -.. -.. Installation in the home directory of the user -.. ---------------------------------------------- -.. -.. Installation in a pyvenv -.. ---------------------- + +Installing on a Apple OS X +-------------------------- + +(Many thanks to Lei Li for contribution this part!) + +1. Installing all required software/packages +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To download and install Python 3 follow the instruction at this +`download page `_. + +Download and install `xcode `_ (`download page `_) and R +(download links are on `the frontpage `_). + +To install ``pip`` open a terminal and run + +:: + + curl -O https://bitbucket.org/pypa/setuptools/raw/bootstrap/ez_setup.py python3 ez_setup.py # download and install pip + curl -O https://raw.github.com/pypa/pip/master/contrib/get-pip.py + python3 get-pip.py + +Install ``matplotlib``: + +:: + + pip3 install python3-matplotlib + + +1. Installing segemehl, DESeq, pysam and READemption +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The remaining installation steps are the same as descibed above. Just +open a terminal and run the commands. + + +Updating READemption +-------------------- + +Once you have installed READemption as described above you can easily +update it to the newest version by running + +:: + + pip3 install READemption -U diff --git a/docs/source/live_and_installation_image.rst b/docs/source/live_and_installation_image.rst new file mode 100644 index 00000000..857b2e45 --- /dev/null +++ b/docs/source/live_and_installation_image.rst @@ -0,0 +1,19 @@ +Live system and installation image +================================== + +For users who do not feel comfortable with the installation of +READemption, who want to run it under Windows or who just want to test +it without installation we offer a Ubuntu 14.4 based live system / +installation image (~ 1.2 GB large) which can be `downloaded here +`_. Once you have retrieved the image +you can either use it to install or test the system on a physical +machine or use it in a virtual machine (e.g. using the open source +software `VirtualBox `_). The required +installation/setup steps are the `same as for the official Ubuntu +version +`_. The +only difference is that once you have a running system READemption can +be used out of the box without any further setup. A very detailed +description of the setup of a virtual machine with Ubuntu under +Windows can be found `here +`_. diff --git a/docs/source/subcommands.rst b/docs/source/subcommands.rst index 0e316532..5d25012d 100644 --- a/docs/source/subcommands.rst +++ b/docs/source/subcommands.rst @@ -31,20 +31,25 @@ annotation files in GFF3 format have to be put into align ----- -``align`` performs the clipping and size filtering of the reads, as well -as the actual aligning to the reference sequences. It also generates -statistics about the steps (e.g. number of aligned reads, number of -mappings). As the result of this steps are needed by the other -subcommands it has to be run before the others. It requires reads in -FASTA format (or counterparts compressed with ``gzip`` or ``bzip2``) -and reference sequences in FASTA format. ``align`` generates the read -alignments in BAM format (``*.bam``) and also index files for those -(``*.bam.bai``). Is also stores unmapped reads so that they can be -inspected e.g. to search for contaminations. The file +``align`` performs the clipping and size filtering of the reads, as +well as the actual aligning to the reference sequences. It also +generates statistics about the steps (e.g. number of aligned reads, +number of mappings). As the result of this steps are needed by the +other subcommands it has to be run before the others. It requires +reads in FASTA or FASTQ format (or counterparts compressed with +``gzip`` or ``bzip2``) and reference sequences in FASTA +format. ``align`` generates the read alignments in BAM format +(``*.bam``) and also index files for those (``*.bam.bai``). Is also +stores unmapped reads so that they can be inspected e.g. to search for +contaminations. The file ``output/align/reports_and_stats/read_alignment_stats.csv`` lists several mapping statistics. The folder -``output/align/reports_and_stats/stats_data_json/`` contains files with -the original countings in JSON format. +``output/align/reports_and_stats/stats_data_json/`` contains files +with the original countings in JSON format. Please be aware that +READemption does not perform quality trimming or adapter clipping so +far. For this purpose use the `FASTX toolkit +`_, `cutadapt +`_ or other tools. :: @@ -55,19 +60,21 @@ the original countings in JSON format. [--segemehl_bin SEGEMEHL_BIN] [--paired_end] [--split] [--poly_a_clipping] [--realign] [--keep_original_alignments] [--lack_bin LACK_BIN] - [--check_for_existing_files] [--progress] + [--fastq] [--check_for_existing_files] [--progress] + [--crossalign_cleaning CROSSALIGN_CLEANING_STRING] [project_path] - + positional arguments: project_path Path of the project folder. If none is given the current directory is used. - + optional arguments: -h, --help show this help message and exit --min_read_length MIN_READ_LENGTH, -l MIN_READ_LENGTH - Minimal read length after clipping. + Minimal read length after clipping (default 12). + Should be higher for eukaryotic species. --processes PROCESSES, -p PROCESSES - Number of processes that should be used. + Number of processes that should be used (default 1). --segemehl_accuracy SEGEMEHL_ACCURACY, -a SEGEMEHL_ACCURACY Segemehl's minimal accuracy (in %) (default 95). --segemehl_evalue SEGEMEHL_EVALUE, -e SEGEMEHL_EVALUE @@ -91,6 +98,7 @@ the original countings in JSON format. (lack) after merging. --lack_bin LACK_BIN, -L LACK_BIN Lack's binary path (default 'lack.x'). + --fastq, -q Input reads are in FASTQ not FASTA format. --check_for_existing_files, -f Check for existing files (e.g. from a interrupted previous run) and do not overwrite them if they exits. @@ -104,7 +112,6 @@ the original countings in JSON format. org_1_repl1>,,..,;:,,..,' - coverage -------- @@ -159,7 +166,7 @@ positions. To turn off this behavior use number of aligned reads even if only uniquely aligned reads are used for the coverage calculation. --processes PROCESSES, -p PROCESSES - Number of processes that should be used. + Number of processes that should be used (default 1). --skip_read_count_splitting, -s Do not split the read counting between different alignings. Default is to do the splitting. @@ -216,7 +223,7 @@ overlaps are counted and separately listed. sense and anti-sense overlaps are counted and separately reported. --processes PROCESSES, -p PROCESSES - Number of processes that should be used. + Number of processes that should be used (default 1). --features ALLOWED_FEATURES, -t ALLOWED_FEATURES Comma separated list of features that should be considered (e.g. gene, cds, region, exon). Other diff --git a/reademptionlib/controller.py b/reademptionlib/controller.py index 2c6c8d8e..dec0ccfe 100644 --- a/reademptionlib/controller.py +++ b/reademptionlib/controller.py @@ -53,8 +53,6 @@ def create_project(self, version): project_creator.create_root_folder(self._args.project_path) project_creator.create_subfolders(self._paths.required_folders()) project_creator.create_version_file(self._paths.version_path, version) - sys.stdout.write("Created folder \"%s\" and required subfolders.\n" % ( - self._args.project_path)) sys.stdout.write("Created folder \"%s\" and required subfolders.\n" % ( self._args.project_path)) sys.stdout.write("Please copy read files into folder \"%s\" and " @@ -253,8 +251,9 @@ def _prepare_reads_single_end(self): continue read_processor = ReadProcessor( poly_a_clipping=self._args.poly_a_clipping, - min_read_length=self._args.min_read_length) - read_files_and_jobs[lib_name] = executor.submit( + min_read_length=self._args.min_read_length, + fastq=self._args.fastq) + read_files_and_jobs[lib_name] = executor.submit( read_processor.process_single_end, read_path, processed_read_path) self._evaluet_job_and_generate_stat_file(lib_name, read_files_and_jobs) @@ -272,7 +271,8 @@ def _prepare_reads_paired_end(self): continue read_processor = ReadProcessor( poly_a_clipping=False, - min_read_length=self._args.min_read_length) + min_read_length=self._args.min_read_length, + fastq=self._args.fastq) read_files_and_jobs[lib_name] = executor.submit( read_processor.process_paired_end, read_path_pair, processed_read_path_pair) @@ -295,7 +295,7 @@ def _evaluet_job_and_generate_stat_file( read_files_and_stats, self._paths.read_processing_stats_path) def _align_single_end_reads(self): - """Manage the actual alignement of single end reads.""" + """Manage the actual alignment of single end reads.""" read_aligner = ReadAligner(self._args.segemehl_bin, self._args.progress) if self._file_needs_to_be_created(self._paths.index_path) is True: read_aligner.build_index( diff --git a/reademptionlib/fastq.py b/reademptionlib/fastq.py new file mode 100644 index 00000000..99a56683 --- /dev/null +++ b/reademptionlib/fastq.py @@ -0,0 +1,47 @@ +class FastqParser(object): + """A class for parsing and handling fastQ files. + + Currently only taking care of the sequences not the quality + strings. Only four line entries can be handled + """ + + def entries(self, fastq_fh): + """Go line by line though the file and return fastq entries. """ + current_header = '' + current_sequence = '' + # Switch telling if this the first entry + virgin = True + entry_line_counter = 0 + for line in fastq_fh: + # For usual headers + if line[0] == '@' and virgin == False and entry_line_counter == 4: + entry_line_counter = 1 + yield(current_header, current_sequence) + current_header = line[1:-1] + current_sequence = '' + # For the very first header of the file + elif line[0] == '@' and virgin == True: + virgin = False + current_header = line[1:-1] + entry_line_counter = 1 + # For sequence lines + else: + entry_line_counter += 1 + if entry_line_counter == 2: + current_sequence = line[:-1] + # For the last entry if file was not empty + if not (current_header == '' and current_sequence == ''): + yield(current_header, current_sequence) + + def single_entry_file_header(self, fastq_fh): + first_line = fastq_fh.readline() + header = first_line[1:-1] + fastq_fh.seek(0) + return header + + def header_id(self, header): + """Return only the id of a fastq header + + Everything after the first white space is discard. + """ + return header.split()[0] diff --git a/reademptionlib/readprocessor.py b/reademptionlib/readprocessor.py index d3c5d9b9..cfdf729b 100644 --- a/reademptionlib/readprocessor.py +++ b/reademptionlib/readprocessor.py @@ -2,16 +2,17 @@ import bz2 from collections import defaultdict from reademptionlib.fasta import FastaParser +from reademptionlib.fastq import FastqParser from reademptionlib.polyaclipper import PolyAClipper class ReadProcessor(object): def __init__(self, poly_a_clipping=False, min_read_length=12, - paired_end=False): + paired_end=False, fastq=False): self._poly_a_clipping = poly_a_clipping self._min_read_length = min_read_length self._paired_end = paired_end - self._fasta_parser = FastaParser() + self._fastq = fastq self._poly_a_clipper = PolyAClipper() def process_single_end(self, input_path, output_path): @@ -60,7 +61,7 @@ def _line_interator(self, input_path, open_func): yield(line.decode()) def _process_single_end(self, input_fh, output_fh): - for header, seq in self._fasta_parser.entries(input_fh): + for header, seq in self._seq_parser().entries(input_fh): self._stats["total_no_of_reads"] += 1 if self._poly_a_clipping: clipped_seq = self._poly_a_clipper.clip_poly_a_strech(seq) @@ -89,8 +90,8 @@ def _process_single_end(self, input_fh, output_fh): def _process_paired_end( self, input_p1_fh, input_p2_fh, output_p1_fh, output_p2_fh): for fasta_entry_p1, fasta_entry_p2 in zip( - self._fasta_parser.entries(input_p1_fh), - self._fasta_parser.entries(input_p2_fh)): + self._seq_parser().entries(input_p1_fh), + self._seq_parser().entries(input_p2_fh)): header_p1 = fasta_entry_p1[0] header_p2 = fasta_entry_p2[0] seq_p1 = fasta_entry_p1[1] @@ -115,3 +116,8 @@ def _process_paired_end( # Encoding to bytes is necessary due to saving via gzip output_p1_fh.write(str.encode(">%s\n%s\n" % (header_p1, seq_p1))) output_p2_fh.write(str.encode(">%s\n%s\n" % (header_p2, seq_p2))) + + def _seq_parser(self): + if self._fastq: return FastqParser() + else: return FastaParser() + diff --git a/reademptionlib/vizgenequanti.py b/reademptionlib/vizgenequanti.py index af8a9d32..581e60ce 100644 --- a/reademptionlib/vizgenequanti.py +++ b/reademptionlib/vizgenequanti.py @@ -92,7 +92,7 @@ def plot_annotation_class_quantification(self, plot_path): bottom = np.array([0] * no_of_libs) fig = plt.figure() ax = plt.subplot(111) - font = {'family' : 'sans-serif', 'weight' : 'normal', 'size' : 6} + font = {'family' : 'sans-serif', 'weight' : 'normal', 'size' : 6} matplotlib.rc('font', **font) plt.title("Number of reads per RNA classes") color_index = 0 diff --git a/setup.py b/setup.py index 0fa4f71c..e0562715 100644 --- a/setup.py +++ b/setup.py @@ -5,14 +5,14 @@ setup( name='READemption', - version='0.3.0', + version='0.3.1', packages=['reademptionlib', 'tests'], author='Konrad U. Förstner', author_email='konrad@foerstner.org', description='A RNA-Seq Analysis Pipeline', url='', install_requires=[ - "pysam >= 0.7.7" + "pysam >= 0.7.8" ], scripts=['bin/reademption'], license='ISC License (ISCL)', diff --git a/tests/test_controller.py b/tests/test_controller.py index c35560bf..68c8ed7b 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -20,6 +20,7 @@ class ArgMock(object): split = False realign = False crossalign_cleaning_str = None + fastq = False class TestController(unittest.TestCase): diff --git a/tests/test_fastq.py b/tests/test_fastq.py new file mode 100644 index 00000000..d92372d9 --- /dev/null +++ b/tests/test_fastq.py @@ -0,0 +1,65 @@ +from io import StringIO +import unittest +import sys +sys.path.append(".") +from reademptionlib.fastq import FastqParser + +class TestFastqParser(unittest.TestCase): + + def setUp(self): + self.fastq_parser = FastqParser() + self.example_data = ExampleData() + + def test_parse_1(self): + fastq_fh = StringIO(self.example_data.fastq_seqs_1) + self.assertEqual( + list(self.fastq_parser.entries(fastq_fh)), + [('test_1 a random sequence', 'TTTAGAAATTACACA'), + ('test_2 another random sequence', 'ACGAGAAATTAAATTAAATT'), + ('test_3 another random sequence', 'TAGAGACATTGGATTTTATT')]) + + def test_parse_empty_file(self): + fastq_fh = StringIO("") + self.assertEqual( + list(self.fastq_parser.entries(fastq_fh)), []) + + def test_single_entry_file_header(self): + fastq_fh = StringIO(self.example_data.fastq_seqs_2) + self.assertEqual(self.fastq_parser.single_entry_file_header(fastq_fh), + "test_4 a random sequence") + + def test_header_id_1(self): + self.assertEqual( + self.fastq_parser.header_id("seq_10101 An important protein"), + "seq_10101") + + def test_header_id_2(self): + self.assertEqual( + self.fastq_parser.header_id("seq_10101\tAn important protein"), + "seq_10101") + +class ExampleData(object): + + fastq_seqs_1 = """@test_1 a random sequence +TTTAGAAATTACACA ++ +!!!!!!!!!!!!!!! +@test_2 another random sequence +ACGAGAAATTAAATTAAATT ++ +@@@@@@@@@@@@@@@@@@@@ +@test_3 another random sequence +TAGAGACATTGGATTTTATT ++ +!!!!!!!!!!!!!!!!!!!! +""" + + fastq_seqs_2 = """@test_4 a random sequence +TTTAGAAATTACACA ++ +!!!!!!!!!!!!!!! +""" + +if __name__ == "__main__": + unittest.main() +