From 2b56aa9a5cca65d567fa98bdba25be2831af0b5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 30 Mar 2014 09:13:22 +0200 Subject: [PATCH 01/11] Set version to 0.2.1dev --- bin/reademption | 2 +- docs/source/conf.py | 4 ++-- setup.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/reademption b/bin/reademption index e8b2b0fa..7558667e 100755 --- a/bin/reademption +++ b/bin/reademption @@ -9,7 +9,7 @@ __author__ = "Konrad Foerstner " __copyright__ = "2011-2013 by Konrad Foerstner " __license__ = "ISC license" __email__ = "konrad@foerstner.org" -__version__ = "0.2.0" +__version__ = "0.2.1dev" def main(): parser = argparse.ArgumentParser() diff --git a/docs/source/conf.py b/docs/source/conf.py index 1a27e211..7cdc11d5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -48,9 +48,9 @@ # built documents. # # The short X.Y version. -version = '0.1' +version = '0.2' # The full version, including alpha/beta/rc tags. -release = '0.1' +release = '0.2.1dev' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index 9e7bc5da..0fda37ff 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='READemption', - version='0.2.0', + version='0.2.1dev', packages=['reademptionlib', 'tests'], author='Konrad U. Förstner', author_email='konrad@foerstner.org', From 4c89a8a7515f5e65a33815543fa391214ec1076c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 30 Mar 2014 09:13:55 +0200 Subject: [PATCH 02/11] Add tasks --- Makefile | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 92061566..ff46ddd2 100644 --- a/Makefile +++ b/Makefile @@ -8,14 +8,22 @@ coverage: package: python3.3 setup.py sdist +package_to_pypi: + python setup.py sdist upload + @echo "Go to https://pypi.python.org/pypi/READemption/" + html_doc: cd docs && make html && cd .. 
+upload_doc: + cd docs/build/html/ && zip -r READemption_docs.zip * && cd ../../.. && mv docs/build/html/READemption_docs.zip . + @echo "Upload at https://pypi.python.org/pypi?%3Aaction=pkg_edit&name=READemption" + show_html_docs: firefox docs/build/html/index.html & readme_txt: - pandoc --from=markdown --to=plain README.md -o README.tex + pandoc --from=markdown --to=plain README.md -o README.txt readme_html: pandoc --from=markdown --to=html README.md -o README.html @@ -25,6 +33,19 @@ readme_rst: readme_clean: rm -f README.tex README.html README.rst + rm -f README.tex README.html README.txt pylint: pylint bin/reademption reademptionlib/* tests/* + +new_release: + @echo "* Please do this manually:" + @echo "* ------------------------" + @echo "* Change bin/reademption" + @echo "* Change setup.py" + @echo "* Change docs/source/conf.py" + @echo "* Change CHANGELOG.txt" + @echo "* Commit changes e.g. 'git commit -m 'Set version to 0.2.0'" + @echo "* Tag the commit e.g. 'git tag -a v0.1.9 -m 'version v0.1.9''" + @echo "* After pushing generate a new release based on this tag at" + @echo " https://github.com/konrad/READemption/releases/new" From d5bd253aa212305b4553f1e139e02e4637e28c31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 30 Mar 2014 09:15:21 +0200 Subject: [PATCH 03/11] Add OS classifier --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9e7bc5da..a32a880c 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ classifiers=[ 'License :: OSI Approved :: ISC License (ISCL)' 'Programming Language :: Python :: 3', - 'Topic :: Scientific/Engineering :: Bio-Informatics' + 'Topic :: Scientific/Engineering :: Bio-Informatics', + 'Operating System :: POSIX' ] ) From 0b371ef8ff50e0241f9f7f849c2376064d1a355d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 30 Mar 2014 09:15:57 +0200 Subject: [PATCH 04/11] Remove MANISTE file --- MANIFEST | 39 
--------------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 MANIFEST diff --git a/MANIFEST b/MANIFEST deleted file mode 100644 index 6b33c948..00000000 --- a/MANIFEST +++ /dev/null @@ -1,39 +0,0 @@ -# file GENERATED by distutils, do NOT edit -README.txt -setup.py -bin/rapl.py -rapl/__init__.py -rapl/controller.py -rapl/coveragecalculator.py -rapl/deseq.py -rapl/fasta.py -rapl/genewisequanti.py -rapl/gff3.py -rapl/parameterlog.py -rapl/paths.py -rapl/polyaclipper.py -rapl/projectcreator.py -rapl/rawstatdata.py -rapl/readaligner.py -rapl/readalignerstats.py -rapl/readalignerstatstable.py -rapl/readclipper.py -rapl/readprocessor.py -rapl/sambamconverter.py -rapl/segemehl.py -rapl/vizalign.py -rapl/vizdeseq.py -rapl/vizgenequanti.py -rapl/wiggle.py -test/test_all.py -test/test_controller.py -test/test_coveragecalculator.py -test/test_fasta.py -test/test_genewisequanti.py -test/test_gff3.py -test/test_paths.py -test/test_polyaclipper.py -test/test_projectcreator.py -test/test_readalignerstats.py -test/test_readclipper.py -test/test_segemehl.py From b02d2dbbd558cf8e682a72d359d0d33539faeae3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 30 Mar 2014 09:17:05 +0200 Subject: [PATCH 05/11] Use README.rst for long description --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a32a880c..234b0253 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ ], scripts=['bin/reademption'], license='LICENSE.txt', - long_description=open('README.txt').read(), + long_description=open('README.rst').read(), classifiers=[ 'License :: OSI Approved :: ISC License (ISCL)' 'Programming Language :: Python :: 3', From bf97793908052961f066f70d454b31abd199634d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 30 Mar 2014 09:18:54 +0200 Subject: [PATCH 06/11] Add documentation and installation links --- README.md | 15 +++++++++++++++ 1 file changed, 15 
insertions(+) diff --git a/README.md b/README.md index 988c115c..c5d69407 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,21 @@ Once the input files are copied into defined folders no further parameters have to be given. Still, READemption's behavior can be adapted to specific needs of the user. +Documentation +------------- + +Documentation can be found [here](http://pythonhosted.org/READemption/). + +Installation +------------ + +Short version (if you have all the requirements installed): + + $ pip install READemption + +[Long version](http://pythonhosted.org/READemption/installation.html) +(what are the requirements and how do you get them) + License ------- From cc825ffe636ba3b6ee673985e14d401b2e6e6361 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 30 Mar 2014 09:24:13 +0200 Subject: [PATCH 07/11] Add badges --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index c5d69407..a6d6feec 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ +[![Latest Version](https://pypip.in/version/READemption/badge.png)](https://pypi.python.org/pypi/READemption/) +[![License](https://pypip.in/license/READemption/badge.png)](https://pypi.python.org/pypi/READemption/) +[![Downloads](https://pypip.in/d/READemption/badge.png)](https://pypi.python.org/pypi/READemption/) + About ----- From 916c5d96699fef82477dd4638c4c968e15223f89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 30 Mar 2014 09:25:35 +0200 Subject: [PATCH 08/11] Sort classifiers --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 234b0253..30dd349f 100644 --- a/setup.py +++ b/setup.py @@ -19,8 +19,8 @@ long_description=open('README.rst').read(), classifiers=[ 'License :: OSI Approved :: ISC License (ISCL)' + 'Operating System :: POSIX' 'Programming Language :: Python :: 3', 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Operating System :: POSIX' ] ) From 
de5e2398bf33ecae69717b5843343d58cb287f80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Mon, 31 Mar 2014 22:49:36 +0200 Subject: [PATCH 09/11] Improve documentation --- docs/source/example_analysis.rst | 251 ++++++++++++++++++--- docs/source/index.rst | 12 +- docs/source/installation.rst | 105 +++++---- docs/source/subcommands.rst | 375 ++++++++++++++++++++----------- 4 files changed, 523 insertions(+), 220 deletions(-) diff --git a/docs/source/example_analysis.rst b/docs/source/example_analysis.rst index e8254a2c..2e9369a8 100644 --- a/docs/source/example_analysis.rst +++ b/docs/source/example_analysis.rst @@ -1,60 +1,259 @@ -Performing a standard analysis +Performing an example analysis ============================== -In the following we will run a small example analysis using a publicly -available RNA-Seq data set. +Here you will be guided through a small example analysis using a +publicly available RNA-Seq data set. We will use a data set from NCBI +GEO that was part of a publication by `Kröger et +al. `_. This is a +transcriptome analysis of *Salmonella* Typhimurium SL1344 in different +conditions. + +We will generate several output files in different formats. The CSV +(tab separated plain text files) files can be opened with any +spreadsheet program like `LibreOffice `_ +or Excel. For inspecting the mappings (in BAM format) and coverage +files (wiggle format) you can use a genome browser for example `IGB +`_ or `IGV +`_. Generating a project -------------------- -Creating a new project:: +At first we have to create the analysis folder and its subfolders. For +this we use the ``create`` subcommand:: - $ reademption create my_rna_seq_analysis - Created folder "my_rna_seq_analysis" and required subfolders. - Please copy read files into folder "my_rna_seq_analysis/input/reads" and reference sequences files into folder "my_rna_seq_analysis/input/reference_sequences". 
+ $ reademption create READemption_analysis + Created folder "READemption_analysis2" and required subfolders. + Please copy read files into folder "READemption_analysis2/input/reads" and reference sequences files into folder "READemption_analysis2/input/reference_sequences". +This will result in a folder structure as shown here: +:: - $ ls my_rna_seq_analysis/* - my_rna_seq_analysis/input: - annotation_files reads reference_sequences + READemption_analysis + ├── input + │   ├── annotation_files + │   ├── reads + │   └── reference_sequences + └── output + ├── align + │   ├── alignments + │   ├── index + │   ├── processed_reads + │   ├── reports_and_stats + │   │   ├── stats_data_json + │   │   └── used_reademption_version.txt + │   └── unaligned_reads + ├── coverage + │   ├── coverage-raw + │   ├── coverage-tnoar_mil_normalized + │   └── coverage-tnoar_min_normalized + ├── deseq + │   ├── deseq_raw + │   └── deseq_with_annotations + ├── gene_quanti + │   ├── gene_quanti_combined + │   └── gene_quanti_per_lib + ├── viz_align + ├── viz_deseq + └── viz_gene_quanti - my_rna_seq_analysis/output: - coverages-raw deseq_comparisons read_alignments-index reports_and_stats - coverages-tnoar_mil_normalized gene_wise_quantifications read_alignments-processed_reads stats_data_json - coverages-tnoar_min_normalized read_alignments-alignments read_alignments-unaligned_reads Retrieving the input data ------------------------- +We have to download the reference sequence (FASTA format) as well as +the annotation file (GFF3 format) for *Salmonella* from NCBI. As we +will use the URL of *Salmonella* Typhimurium SL1344's source FTP +folder it several times we store it in an environment variable called +``FTP_SOURCE``. -Store the URL of source FTP folder in an environment variable. :: - $ FTP_SOURCE=ftp://ftp.ncbi.nih.gov/genomes/Bacteria/Salmonella_enterica_serovar_Typhimurium_SL1344_uid86645 -Download the reference sequence (the chromosome and three plasmids) in Fasta format. 
+ $ FTP_SOURCE=ftp://ftp.ncbi.nih.gov/genomes/Bacteria/Salmonella_enterica_serovar_Typhimurium_SL1344_uid86645 + +We download the reference sequence (the chromosome and three plasmids) +in FASTA format and store them in the ``reference_sequences`` +folder. The files are saved with a different suffix (``.fa`` instead +of ``.fna``) as some genome browsers (e.g. IGB) will not accept them as +FASTA files otherwise. + +:: + + $ wget -O READemption_analysis/input/reference_sequences/NC_016810.fa $FTP_SOURCE/NC_016810.fna + $ wget -O READemption_analysis/input/reference_sequences/NC_017718.fa $FTP_SOURCE/NC_017718.fna + $ wget -O READemption_analysis/input/reference_sequences/NC_017719.fa $FTP_SOURCE/NC_017719.fna + $ wget -O READemption_analysis/input/reference_sequences/NC_017720.fa $FTP_SOURCE/NC_017720.fna + +We have to modify the header of the FASTA files as the sequence IDs +have to be the same as the ones in the first column of the GFF3 files +(see below) to be used in the gene quantification. This will be also +necessary if both, FASTA and GFF3 files, will be loaded in the IGB. + +:: + + $ sed -i "s/>/>NC_016810.1 /" READemption_analysis/input/reference_sequences/NC_016810.fa + $ sed -i "s/>/>NC_017718.1 /" READemption_analysis/input/reference_sequences/NC_017718.fa + $ sed -i "s/>/>NC_017719.1 /" READemption_analysis/input/reference_sequences/NC_017719.fa + $ sed -i "s/>/>NC_017720.1 /" READemption_analysis/input/reference_sequences/NC_017720.fa + +Then we download the GFF3 files that contain the annotations. +:: + + $ wget -P READemption_analysis/input/annotations $FTP_SOURCE/*gff + +Finally, we need the reads of the RNA-Seq libraries. To save some time +for running this example we will work with subsampled libraries of 1M +reads each. This will limit the informative value of the results which +is acceptable as we just want to understand the workflow of +READemption. 
+ :: - $ wget -cP my_rna_seq_analysis/input/reference_sequences ${FTP_SOURCE}/NC_016810.fna ... -Download the annotation for in GFF format. + $ wget -P READemption_analysis/input/reads http://reademptiondata.imib-zinf.net/InSPI2_R1.fa.bz2 + $ wget -P READemption_analysis/input/reads http://reademptiondata.imib-zinf.net/InSPI2_R2.fa.bz2 + $ wget -P READemption_analysis/input/reads http://reademptiondata.imib-zinf.net/LSP_R1.fa.bz2 + $ wget -P READemption_analysis/input/reads http://reademptiondata.imib-zinf.net/LSP_R2.fa.bz2 + +We have now all the necessary data available. The input folder should +look like this now: + :: - $ wget -cP ${FTP_SOURCE}/NC_016810.gff ... -Download the reads in FASTA format + $ ls READemption_analysis/input/* + READemption_analysis/input/annotations: + NC_016810.gff NC_017718.gff NC_017719.gff NC_017720.gff + + READemption_analysis/input/reads: + InSPI2_R1.fa.bz2 InSPI2_R2.fa.bz2 LSP_R1.fa.bz2 LSP_R2.fa.bz2 + + READemption_analysis/input/reference_sequences: + NC_016810.fa NC_017718.fa NC_017719.fa NC_017720.fa + +Processing and aligning the reads +--------------------------------- + +The first step is the read processing and mapping. Via parameters we +tell READemption to use 4 CPUs (``-p 4``) and perform a poly-A-clipping +(``--poly_a_clipping``) before the mapping. + :: - $ wget ${FTP_SOURCE}/NC_016810.fna ... -Aligning the reads to the reference genome ------------------------------------------- + $ reademption align -p 4 --poly_a_clipping READemption_analysis + +Once the mapping is done the file ``read_alignment_stats.csv`` is +created which can be found in +``READemption_analysis/output/align/reports_and_stats/``. It contains +several mapping statistics for example how many reads are successfully +aligned in total and how many were aligned to each replicon. We see +that more than 98 % are mapped for each library. Sorted and indexed +alignments in BAM format are stored in +``READemption_analysis/output/align/alignments``. 
We could load them +in a genome browser but instead we continue with the next step. + Generating coverage files ------------------------- -`Integrated genome browser (IGB) `_ -`Integrative genome viewer (IGV) `_ +In order to generate strand specific coverage files with different +normalizations we use the subcommand ``coverage``. + +:: + + $ reademption coverage -p 4 READemption_analysis +The sets are stored in subfolders of +``READemption_analysis/output/coverage/``. The most often used set is +stored in ``coverage-tnoar_min_normalized``. Here the coverages are +normalized by the total number of aligned reads (TNOAR) of the +individual library and then multiplied by the lowest TNOAR value of +all libs. These files could be inspected for differential RNA-Seq +(dRNA-Seq - comparing libraries with and without Terminator +Exonuclease treatment) data in order to determine transcriptional +start sites. They can be loaded in common genome browsers like `IGB +`_ or `IGV +`_. Keep in mind that the +coverages of the reverse strand have negative values so you have to +adapt the scaling in some genome browsers. Performing gene wise quantification ----------------------------------- +In this step we want to quantify the number of reads overlapping with +the locations of the annotation entries. With the ``--features`` +parameter we configure ``reademption`` to just quantify CDS, tRNA and +rRNA entries. + +:: + + $ reademption gene_quanti -p 4 --features CDS,tRNA,rRNA READemption_analysis + +After the quantification we find tables that contain the combined +counting for all entries in +``READemption_analysis/output/gene_quanti/gene_quanti_combined/``. The +countings for mappings in sense and anti-sense are separately +listed. Besides the raw countings there are also tables for +countings normalized by the total number of reads and RPKM values. 
+ Performing differential gene expression analysis ------------------------------------------------ + +To compare the gene expression of different conditions we apply the +subcommand ``deseq`` which makes use of the R library `DESeq2 +`_. + +:: + + $ reademption deseq \ + -l InSPI2_R1.fa.bz2,InSPI2_R2.fa.bz2,LSP_R1.fa.bz2,LSP_R2.fa.bz2 \ + -c InSPI2,InSPI2,LSP,LSP READemption_analysis + +:: + +We have to tell READemption which libraries are replicates of which +condition. This is done by the parameters ``-l`` and ``-c``. ``-l`` +should hold a comma separated list of the libraries and ``-c`` the +corresponding conditions. In our case we have 4 libraries +(``InSPI2_R1.fa.bz2``, ``InSPI2_R2.fa.bz2``, ``LSP_R1.fa.bz2``, +``LSP_R2.fa.bz2``) and two conditions (which we call ``InSPI2`` and +``LSP``). Just to make this association easier to understand: + +:: + + InSPI2_R1.fa.bz2 InSPI2_R2.fa.bz2 LSP_R1.fa.bz2 LSP_R2.fa.bz2 + | | | | + InSPI2 InSPI2 LSP LSP + +When you call ``deseq`` it will compare all conditions with each other +and you can pick the comparison that you need. The raw ``DESeq2`` +results are enriched with the original annotation information and are +stored in +``READemption_analysis/output/deseq/deseq_with_annotations/``. + +Create plots +------------ + +Finally we generate plots that visualize the results of the different +steps. ``viz_align`` will create histograms of the read length +distribution for the untreated and treated reads (saved in +``READemption_analysis/output/viz_align/``). + +:: + + $ reademption viz_align READemption_analysis + +``viz_gene_quanti`` visualizes the gene wise countings. In our example +you will see that - as expected - the replicates are more similar to +each other than to the libs of the other condition. It also generates +bar plots that show the distribution of reads inside the different RNA +classes. + +:: + + $ reademption viz_gene_quanti READemption_analysis + +``viz_deseq`` generates MA-plots as well as volcano plots. 
+ +:: + + $ reademption viz_deseq READemption_analysis + diff --git a/docs/source/index.rst b/docs/source/index.rst index 66027ecd..1ef7cf24 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -34,7 +34,7 @@ over configuration*: Once the input files are copied into defined folders no further parameters have to be given. Still, READemption's behavior can be adapted to specific needs of the user. This tools is available as open source under open source license `ICSL -`_ . +`_. Download ======== @@ -51,12 +51,14 @@ The source code of READemption can be found at https://github.com/konrad/READemp Cite ==== -If you apply READemption in you data analysis please cite the +If you apply READemption in your data analysis please cite the following reference: *READemption – A tool for the computational analysis of deep-sequencing-based transcriptome data*. -Konrad U. Förstner, Jörg Vogel, Cynthia M. Sharma; (submitted). A -`pre-preprint version `_ of -the manuscript is hosted at bioRxiv. +Konrad U. Förstner, Jörg Vogel, Cynthia M. Sharma; (submitted). + +.. A +.. `pre-preprint version `_ of +.. the manuscript is hosted at bioRxiv. Contact ======= diff --git a/docs/source/installation.rst b/docs/source/installation.rst index af8c5422..037b8f5a 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -5,96 +5,91 @@ Requirements ------------ READemption was developed using Python 3.3 and for best performance -the user is advised to run READemptionL with this or a higher versoin, -too. Also Python 2.7 can be used if the library `futures +the user is advised to run READemption with this or a higher +version. Also Python 2.7 or earlier Python 3 version can be used if +the backported library `futures `_ is installed. In any case, -the third party modules `pysam `_ as +the third party packages `pysam `_ as well as `setuptool `_ and -`pip `_ in order to make the -installation easy by retrieving are required. 
READemption uses the +`pip `_ should be available on the +system in order to make the installation easy. READemption uses the short read mapper `segemehl `_ for the -mapping and this software needs to be installed. The subcommand -`viz_align`, `viz_gene_quanti`, `viz_deseq` require the Python library -`Matplotlib `_. `R +mapping and this software needs to be installed. The subcommands +``viz_align``, ``viz_gene_quanti``, ``viz_deseq`` require the Python +library `Matplotlib `_. `R `_ and the bioconductor package `DESeq2 `_ are -necessary for the subcommand `deseq` which performs differential gene -expression analysis. Don't worry - in the following the installation -of all these requirements will be covered. +necessary for the subcommand ``deseq`` which performs differential +gene expression analysis. Don't worry - in the following the +installation of all these requirements will be covered. -Installing on a fresh Ubuntu image ----------------------------------- +Installing on a fresh Ubuntu installation +----------------------------------------- -The following installation procedure was tested on a -`Amazon AWS t1.micro +The following installation procedure was tested on a `Amazon AWS +t1.micro `_ instance with Ubuntu Server 13.10 image. -Before starting it is a good idea to update the package list:: - - sudo apt-get update - -Ubuntu 13.10 has Python 3.3 already installed. If this is not the case -install:: - - sudo apt-get install python3 - -Install setuptools:: - - sudo apt-get install python3-setuptools -Install Matplotlib:: +1. 
Installing all required Debian/Ubuntu packages +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - sudo apt-get install python3-matplotlib - -Additionally, Cython is needed.:: - - sudo apt-get install cython3 - sudo apt-get install zlib1g-dev +Before starting it is a good idea to update the package list:: -If PIP is not yet install you should get this, too.:: + sudo apt-get update - curl https://raw.github.com/pypa/pip/master/contrib/get-pip.py > get-pip.py - sudo python3.3 get-pip.py +Now you can install the packages:: -Now you can use PIP to install pysam and READemption:: + sudo apt-get install python3 python3-setuptools python3-pip python3-matplotlib cython3 zlib1g-dev make libncurses5-dev r-base libxml2-dev - pip-3.3 install pysam - pip-3.3 install READemption +Some comments: -Install make and ncurses dev library.:: +- Ubuntu 13.10 should have Python 3.3 already installed. +- ``cython`` is required for ``pysam`` +- ``make``, ``libncurses5-dev`` and ``zlib1g-dev`` are needed for ``segemehl`` +- ``libxml2`` is required for the installation of some of the R-packages - sudo apt-get install make - sudo apt-get install libncurses5-dev +2. Install segemehl +~~~~~~~~~~~~~~~~~~~ -Install segemehl.:: +:: - curl http://www.bioinf.uni-leipzig.de/Software/segemehl/segemehl_0_1_6.tar.gz > segemehl_0_1_6.tar.gz - tar xzf segemehl_0_1_6.tar.gz + curl http://www.bioinf.uni-leipzig.de/Software/segemehl/segemehl_0_1_7.tar.gz > segemehl_0_1_7.tar.gz + tar xzf segemehl_0_1_7.tar.gz cd segemehl_*/segemehl/ && make && cd ../../ -Copying it.:: +Copying it to a location that is part of the ``PATH`` e.g. ``/usr/bin/`` ... + +:: - sudo cp segemehl*/segemehl/segemehl.x /usr/bin/segemehl + sudo cp segemehl_0_1_7/segemehl/segemehl.x /usr/bin/segemehl.x + sudo cp segemehl_0_1_7/segemehl/lack.x /usr/bin/lack.x -Alternative.:: +... or the bin folder of your home directory:: mkdir ~/bin cp segemehl_0_1_7/segemehl/segemehl.x ~/bin -Install R:: +3. 
Install DESeq2 +~~~~~~~~~~~~~~~~~ + +:: - sudo apt-get install r-base + echo 'source("http://bioconductor.org/biocLite.R");biocLite("DESeq2")' | sudo Rscript - -and libxml2 which is required for the installation of some R-packages.:: +Install pysam and READemption +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - sudo apt-get install libxml2-dev +Now you can use ``pip`` to install ``pysam`` and ``READemption``:: -Install DESeq2 in :: + sudo pip3 install pysam + sudo pip3 install READemption - echo 'source("http://bioconductor.org/biocLite.R");biocLite("DESeq2")' | Rscript - +Voilà! You should now be able to call READemption:: + reademption -h .. .. Global installation diff --git a/docs/source/subcommands.rst b/docs/source/subcommands.rst index f161edc5..9062534a 100644 --- a/docs/source/subcommands.rst +++ b/docs/source/subcommands.rst @@ -1,85 +1,102 @@ READemption's subcommands ========================= +In general the subcommands need at least one argument - the analysis +folder. If this is not given READemption assumes that the current +folder is the analysis folder. + create ------ -`create` generates the required folder structure for input and output -files which looks likes this: +``create`` generates the required folder structure for input and +output files. Once these folders are created the input files have to +be placed into the correct locations. As a minimal requirement, +RNA-Seq reads in FASTA format (can be compressed with ``bzip2`` or +``gzip``) must be placed in ``input/reads`` and the reference sequence +in FASTA format must be copied or linked in +``input/reference_sequences``. For the command ``gene_quanti`` +annotation files in GFF3 format have to be put into +``input/annotations``. 
:: - ├── input - │   ├── annotation_files - │   ├── reads - │   └── reference_sequences - └── output - ├── align - │   ├── alignments - │   ├── index - │   ├── processed_reads - │   ├── reports_and_stats - │   │   ├── stats_data_json - │   │   └── used_rapl_version.txt - │   └── unaligned_reads - ├── coverage - │   ├── coverage-raw - │   ├── coverage-tnoar_mil_normalized - │   └── coverage-tnoar_min_normalized - ├── deseq - │   ├── deseq_raw - │   └── deseq_with_annotations - ├── gene_quanti - │   ├── gene_quanti_combined - │   └── gene_quanti_per_lib - ├── viz_align - ├── viz_deseq - └── viz_gene_quanti + usage: reademption create [-h] project_path + + positional arguments: + project_path Name/path of the project. + + optional arguments: + -h, --help show this help message and exit align ----- -`align` performs the clipping and size filtering of the reads, as well +``align`` performs the clipping and size filtering of the reads, as well as the actual aligning to the reference sequences. It also generates statistics about the steps (e.g. number of aligned reads, number of mappings). As the result of this steps are needed by the other -subcommands it has to be run before any other. It requires reads in +subcommands it has to be run before the others. It requires reads in FASTA format (or counterparts compressed with ``gzip`` or ``bzip2``) -and reference sequences in FASTA format (one sequence per -file). `align` generates the read alignment in BAM format (`*.bam`) -and also index files for those (`*.bai`). Is also stores unmapped +and reference sequences in FASTA format. ``align`` generates the read +alignments in BAM format (``*.bam``) and also index files for those +(``*.bam.bai``). Is also stores unmapped reads so that they can be +inspected e.g. to search for contaminations. The file +``output/align/reports_and_stats/read_alignment_stats.csv`` lists +several mapping statistics. 
The folder +``output/align/reports_and_stats/stats_data_json/`` contains files with +the original countings in JSON format. -=> report :: - positional arguments: - project_path Path of the project folder. If none is given the - current directory is used. - - optional arguments: - -h, --help show this help message and exit - --min_read_length MIN_READ_LENGTH, -l MIN_READ_LENGTH - Minimal read length after clipping - --processes PROCESSES, -p PROCESSES - Number of processes that should be used. - --segemehl_accuracy SEGEMEHL_ACCURACY, -a SEGEMEHL_ACCURACY - Segemehl's minimal accuracy (in %) (default 95). - --segemehl_evalue SEGEMEHL_EVALUE, -e SEGEMEHL_EVALUE - Segemehl's maximal e-value. (default 5.0) - --segemehl_bin SEGEMEHL_BIN, -s SEGEMEHL_BIN - Segemehl's binary path. - --split, -S Run segemehl with read splitting - --poly_a_clipping, -c - Perform polyA tail clipping. This option cannot be - used for paired-end reads. - --force, -f Overwrite existing files. - --progress, -P Show progress of the segemehl mapping. - --paired_end, -r Use this if reads are originating from a paired-end - sequencing. The members of a pair must be marked with - '_p1' and '_p2' in front of the file type suffixes - (e.g. 'my_sample_p1.fa' and 'my_sample_p2.fa' or - 'my_sample_p1.fa.bz2' and 'my_sample_p2.fa.bz2'). This - option cannot be use with polyA tail clipping. + usage: reademption align [-h] [--min_read_length MIN_READ_LENGTH] + [--processes PROCESSES] + [--segemehl_accuracy SEGEMEHL_ACCURACY] + [--segemehl_evalue SEGEMEHL_EVALUE] + [--segemehl_bin SEGEMEHL_BIN] [--paired_end] + [--split] [--poly_a_clipping] [--realign] + [--keep_original_alignments] [--lack_bin LACK_BIN] + [--check_for_existing_files] [--progress] + [project_path] + + positional arguments: + project_path Path of the project folder. If none is given the + current directory is used. 
+ + optional arguments: + -h, --help show this help message and exit + --min_read_length MIN_READ_LENGTH, -l MIN_READ_LENGTH + Minimal read length after clipping. + --processes PROCESSES, -p PROCESSES + Number of processes that should be used. + --segemehl_accuracy SEGEMEHL_ACCURACY, -a SEGEMEHL_ACCURACY + Segemehl's minimal accuracy (in %) (default 95). + --segemehl_evalue SEGEMEHL_EVALUE, -e SEGEMEHL_EVALUE + Segemehl's maximal e-value (default 5.0). + --segemehl_bin SEGEMEHL_BIN, -s SEGEMEHL_BIN + Segemehl's binary path (default 'segemehl.x'). + --paired_end, -P Use this if reads are originating from a paired-end + sequencing. The members of a pair must be marked with + '_p1' and '_p2' in front of the file type suffixes + (e.g. 'my_sample_p1.fa' and 'my_sample_p2.fa' or + 'my_sample_p1.fa.bz2' and 'my_sample_p2.fa.bz2'). This + option cannot be use with polyA tail clipping. + --split, -S Run segemehl with read splitting. + --poly_a_clipping, -c + Perform polyA tail clipping. This option cannot be + used for paired-end reads. + --realign, -r Perform realignment of unmapped reads using 'lack'. + --keep_original_alignments, -k + Only used with --realign/-r. Keep the alignment file + of the primary mapper (segemehl) and the realigner + (lack) after merging. + --lack_bin LACK_BIN, -L LACK_BIN + Lack's binary path (default 'lack.x'). + --check_for_existing_files, -f + Check for existing files (e.g. from a interrupted + previous run) and do not overwrite them if they exits. + Attention! You have to take care that there are no + partially generated files left! + --progress, -g Show progress of the segemehl mapping. coverage -------- @@ -90,114 +107,204 @@ read alignments. These wiggle files can be viewed in common genome browser like the `Integrated genome browser (IGB) `_ or the `Integrative genome viewer (IGV) `_. 
Three sets of wiggle -files will be generated: raw couting values without normalization +files will be generated: raw counting values without normalization (located in the folder `coverage-raw`), normalized by the total number -of aligned reads (abriviated as tnoar) and the multiplied by the +of aligned reads (abbreviated as tnoar) and then multiplied by the lowest number of aligned reads of all considered libraries (in folder `coverage-tnoar_min_normalized`) as well as normalized by the total number of aligned reads and multiplied by one million -(`coverage-tnoar_mil_normalized`). The different normalisations make a +(`coverage-tnoar_mil_normalized`). The different normalizations make a visual semi-quantitative comparative possible and enable to perform -transcription start site analysis (e.g. using tools like `TSSAR -`_). For each library and set -there will be coverage files for the forward and the reverse -strand. The coverages for the forward strand have positive while the -one for the reverse stand have negative values in order to make a -visual discrimanation easy. Per default all reads and each position of -them will be considered. To calculate the coverages only based on -uniquely aligned read use the ``--unique_only`` parameter. If only the -first base should be considered add ``--first_base_only``. Reads are -aligned to multiple location will account only in fraction to the -values of the different positions. For example a read that is mapped -to three different location will contribute a value of 1/3 to each of -the nucleotiedes of these positions. To turn off this behavior use +transcription start site analysis (e.g. using tools like `TSSPredator +`_). For +each library and set there will be coverage files for the forward and +the reverse strand. The coverages for the forward strand have positive +values while the ones for the reverse strand have negative values in +order to make a visual discrimination easy.
Per default all reads and +each position of them will be considered. To calculate the coverages +only based on uniquely aligned reads use the ``--unique_only`` +parameter. If only the first base should be considered add +``--first_base_only``. Reads that are aligned to multiple locations will +contribute only a fraction to the values of the different positions. For +example a read that is mapped to three different locations will +contribute a value of 1/3 to each of the nucleotides of these +positions. To turn off this behavior use ``--skip_read_count_splitting``. :: - positional arguments: - project_path Path of the project folder. If none is given the - current directory is used. - - optional arguments: - -h, --help show this help message and exit - --unique_only, -u Use uniquely aligned reads only. - --processes PROCESSES, -p PROCESSES - Number of processes that should be used. - --skip_read_count_splitting, -s - Do not split the read counting between different - alignings. Default is to do the splitting. - --first_base_only, -b - Only the first bases 5' base of each read aligning is - taken into account. - --force, -f Overwrite existing files. + usage: reademption coverage [-h] [--unique_only] [--normalize_by_uniquely] + [--processes PROCESSES] + [--skip_read_count_splitting] [--first_base_only] + [--check_for_existing_files] + [project_path] + + positional arguments: + project_path Path of the project folder. If none is given the + current directory is used. + + optional arguments: + -h, --help show this help message and exit + --unique_only, -u Use uniquely aligned reads only. + --normalize_by_uniquely, -U + Normalize by the number of uniquely aligned reads. By + default the normalization is done based on the total + number of aligned reads even if only uniquely aligned + reads are used for the coverage calculation. + --processes PROCESSES, -p PROCESSES + Number of processes that should be used.
+ --skip_read_count_splitting, -s + Do not split the read counting between different + alignings. Default is to do the splitting. + --first_base_only, -b + Only the first bases 5' base of each read aligning is + taken into account. + --check_for_existing_files, -f + Check for existing files (e.g. from a interrupted + previous run) and do not overwrite them if they exits. + Attention! You have to take care that there are no + partially generated files left! gene_quanti ----------- -With `gene_quanti` the number of reads to each gene is counted and the -results are combined in tables. +With ``gene_quanti`` the number of reads to each annotation entry is +counted and the results are combined in tables. At least one GFF file +with the annotations has to be placed in ``input/annotations``. The +sequence IDs in the annotation files must be precisely the same as the IDs +used in the reference sequence FASTA files. To specify the feature +classes (e.g. CDS, gene, rRNA, tRNA) that should be quantified the +parameter ``--features`` can be used. Otherwise counts for all +annotation entries are generated. Per default sense and anti-sense +overlaps are counted and separately listed. -- IDs must be the same :: - positional arguments: - project_path Path of the project folder. If none is given the - current directory is used. - - optional arguments: - -h, --help show this help message and exit - --min_overlap MIN_OVERLAP, -o MIN_OVERLAP - Minimal read-annotation-overlap (in nt) (default 1) - --skip_norm_by_alignment_freq - --skip_norm_by_overlap_freq - --skip_antisense, -a - --processes PROCESSES, -p PROCESSES - Number of processes that should be used. - --features ALLOWED_FEATURES, -t ALLOWED_FEATURES - Comma separated list of features that should be - considered (e.g. gene, cds, region, exon). Other - feature will be skipped. If not specified all features - will be considered. - --unique_only, -u Use uniquely aligned reads only.
- --pseudocounts, -c Add a pseudocount of 1 to each gene. - --force, -f Overwrite existing files. + usage: reademption gene_quanti [-h] [--min_overlap MIN_OVERLAP] + [--no_count_split_by_alignment_no] + [--no_count_splitting_by_gene_no] + [--skip_antisense] [--processes PROCESSES] + [--features ALLOWED_FEATURES] [--unique_only] + [--pseudocounts] [--check_for_existing_files] + [project_path] + + positional arguments: + project_path Path of the project folder. If none is given the + current directory is used. + + optional arguments: + -h, --help show this help message and exit + --min_overlap MIN_OVERLAP, -o MIN_OVERLAP + Minimal read-annotation-overlap (in nt) (default 1). + --no_count_split_by_alignment_no, -n + Do not split read countings by the number of + alignments a read has. By default this count splitting + is performed. + --no_count_splitting_by_gene_no, -l + Do not split read countings by the number of genes it + overlaps with. By default this count splitting is + performed. + --skip_antisense, -a Do not count anti-sense read-gene-overlaps. By default + sense and anti-sense overlaps are counted and + separately reported. + --processes PROCESSES, -p PROCESSES + Number of processes that should be used. + --features ALLOWED_FEATURES, -t ALLOWED_FEATURES + Comma separated list of features that should be + considered (e.g. gene, cds, region, exon). Other + feature will be skipped. If not specified all features + will be considered. + --unique_only, -u Use uniquely aligned reads only. + --pseudocounts, -c Add a pseudocount of 1 to each gene. + --check_for_existing_files, -f + Check for existing files (e.g. from a interrupted + previous run) and do not overwrite them if they exits. + Attention! You have to take care that there are no + partially generated files left! deseq ----- -Differential gene expression can be performed using `deseq` which will -run a `DESeq `_ analyses for all possible combinations. 
+Differential gene expression analysis can be performed using ``deseq`` which +will run a `DESeq2 `_ +analysis for all possible combinations of conditions. To allocate the +conditions to the libraries use the ``--libs`` and ``--conditions`` +parameters (e.g. ``--libs +SamA_R1.fa,SamA_R2.fa,SamB_R1.fa,SamB_R2.fa --conditions +SamA,SamA,SamB,SamB``). :: - positional arguments: - project_path Path of the project folder. If none is given the - current directory is used. - - optional arguments: - -h, --help show this help message and exit - --libs LIBS, -l LIBS Comma separated list of libraries. - --conditions CONDITIONS, -c CONDITIONS - Comma separated list of condition in the same order as - their corresponding libraries. - --no_replicates, -r + usage: reademption deseq [-h] --libs LIBS --conditions CONDITIONS + [--cooks_cutoff_off] + [project_path] + + positional arguments: + project_path Path of the project folder. If none is given the + current directory is used. + + optional arguments: + -h, --help show this help message and exit + --libs LIBS, -l LIBS Comma separated list of libraries. + --conditions CONDITIONS, -c CONDITIONS + Comma separated list of condition in the same order as + their corresponding libraries. + --cooks_cutoff_off, -k viz_align --------- -`viz_align` plots histograms of the read length distributions of the +``viz_align`` plots histograms of the read length distributions of the reads before and after the read clipping. +:: + + usage: reademption viz_align [-h] [project_path] + + positional arguments: + project_path Path of the project folder. If none is given the current + directory is used. + + optional arguments: + -h, --help show this help message and exit + viz_gene_quanti --------------- -`viz_gene_quanti` creates scatterplots in with the raw gene wise +``viz_gene_quanti`` creates scatterplots in which the raw gene wise quantification values are compared for each library pair (all-against-all).
For each comparison the `pearson correllation `_ -(`r`) coefficiant is. +(`r`) coefficient is calculated. Additionally, bar charts that visualize the +distribution of the read counts of the different annotation classes +are plotted. + +:: + + usage: reademption viz_gene_quanti [-h] [project_path] + + positional arguments: + project_path Path of the project folder. If none is given the current + directory is used. + + optional arguments: + -h, --help show this help message and exit viz_deseq --------- + +``viz_deseq`` generates MA-plots of the comparison (log2 fold changes +vs. the base mean) as well as volcano plots (log2 fold changes +vs. p-values / adjusted p-values). + +:: + + usage: reademption viz_deseq [-h] [project_path] + + positional arguments: + project_path Path of the project folder. If none is given the current + directory is used. + + optional arguments: + -h, --help show this help message and exit From 651328f0084bd2700c6e733b4ed4d39258fa8283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Mon, 31 Mar 2014 22:51:33 +0200 Subject: [PATCH 10/11] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a6d6feec..de12f258 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ reads (as introduced by Sharma et al., Nature, 2010 originating from bacterial samples. Meanwhile is has been extended to process data generated in different experimental setups and originating from all domains of life and is under active development. The subcommands which -are provided by command-line interface cover read processing and +are accessible via a command-line interface cover read processing and aligning, coverage plot generation, gene expression quantification as well as differential gene expression analysis. READemption was applied to analyze numerous data sets.
In order to set up analyses quickly From ab34972fd98e7e51a0e4898e35e97a2ca88983ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Mon, 31 Mar 2014 22:59:50 +0200 Subject: [PATCH 11/11] Set version to 0.2.1 --- CHANGELOG.txt | 4 ++++ bin/reademption | 2 +- docs/source/conf.py | 2 +- setup.py | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index bb2bc110..5e11b424 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,3 +1,7 @@ +v0.2.1 (2014-03-31) +- Improve documentation +- Add badges and further info to Readme +- Extend Makefile v0.2.0 (2014-03-29) - Switch to DESeq - Renaming from TRAPL to READemption (yes, another name change) diff --git a/bin/reademption b/bin/reademption index 7558667e..bd7f708c 100755 --- a/bin/reademption +++ b/bin/reademption @@ -9,7 +9,7 @@ __author__ = "Konrad Foerstner " __copyright__ = "2011-2013 by Konrad Foerstner " __license__ = "ISC license" __email__ = "konrad@foerstner.org" -__version__ = "0.2.1dev" +__version__ = "0.2.1" def main(): parser = argparse.ArgumentParser() diff --git a/docs/source/conf.py b/docs/source/conf.py index 7cdc11d5..88a092a8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -50,7 +50,7 @@ # The short X.Y version. version = '0.2' # The full version, including alpha/beta/rc tags. -release = '0.2.1dev' +release = '0.2.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index 4c5477b6..3ef091d6 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='READemption', - version='0.2.1dev', + version='0.2.1', packages=['reademptionlib', 'tests'], author='Konrad U. Förstner', author_email='konrad@foerstner.org',