From ae4d058a75f657c5c28bea0e6ac66054287e665a Mon Sep 17 00:00:00 2001 From: Joel Berendzen Date: Mon, 7 Dec 2020 17:41:00 -0700 Subject: [PATCH 1/6] read features that end with ; --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 80ccda7..c5511d9 100644 --- a/.gitignore +++ b/.gitignore @@ -62,3 +62,6 @@ target/ # pyenv python configuration file .python-version + +# virtual environment +venv/ From d271417250cee3f60a8556a118ea8780be3cacd0 Mon Sep 17 00:00:00 2001 From: Joel Berendzen Date: Mon, 7 Dec 2020 17:46:27 -0700 Subject: [PATCH 2/6] read features bug --- gffpandas/gffpandas.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/gffpandas/gffpandas.py b/gffpandas/gffpandas.py index 1df09c8..37932e1 100644 --- a/gffpandas/gffpandas.py +++ b/gffpandas/gffpandas.py @@ -5,6 +5,10 @@ def read_gff3(input_file): return Gff3DataFrame(input_file) +def _split_atts(atts): + """Split a feature string into attributes.""" + splits_list = [a.split("=") for a in atts.split(";") if "=" in a] + return {l[0]:"=".join(l[1:]) for l in splits_list} class Gff3DataFrame(object): """This class contains header information in the header attribute and @@ -181,14 +185,7 @@ def attributes_to_columns(self) -> pd.DataFrame: """ attribute_df = self.df.copy() df_attributes = attribute_df.loc[:, "seq_id":"attributes"] - attribute_df["at_dic"] = attribute_df.attributes.apply( - lambda attributes: dict( - [ - key_value_pair.split(sep="=", maxsplit=1) - for key_value_pair in attributes.split(";") - ] - ) - ) + attribute_df["at_dic"] = attribute_df.attributes.apply(_split_atts) attribute_df["at_dic_keys"] = attribute_df["at_dic"].apply( lambda at_dic: list(at_dic.keys()) ) From 003c5891ea742077f1e94db61e19bfb1671222da Mon Sep 17 00:00:00 2001 From: Joel Berendzen Date: Mon, 7 Dec 2020 17:51:31 -0700 Subject: [PATCH 3/6] flake8 --- gffpandas/gffpandas.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/gffpandas/gffpandas.py b/gffpandas/gffpandas.py index 37932e1..ecb2819 100644 --- a/gffpandas/gffpandas.py +++ b/gffpandas/gffpandas.py @@ -5,10 +5,12 @@ def read_gff3(input_file): return Gff3DataFrame(input_file) + def _split_atts(atts): """Split a feature string into attributes.""" splits_list = [a.split("=") for a in atts.split(";") if "=" in a] - return {l[0]:"=".join(l[1:]) for l in splits_list} + return {item[0]: "=".join(item[1:]) for item in splits_list} + class Gff3DataFrame(object): """This class contains header information in the header attribute and @@ -17,7 +19,9 @@ class Gff3DataFrame(object): """ - def __init__(self, input_gff_file=None, input_df=None, input_header=None) -> None: + def __init__( + self, input_gff_file=None, input_df=None, input_header=None + ) -> None: """Create an instance.""" if input_gff_file is not None: self._gff_file = input_gff_file @@ -150,7 +154,9 @@ def filter_feature_of_type(self, feature_type_list) -> "Gff3DataFrame": feature_df = self.df.loc[self.df.type.isin(feature_type_list)] return Gff3DataFrame(input_df=feature_df, input_header=self.header) - def filter_by_length(self, min_length=None, max_length=None) -> "Gff3DataFrame": + def filter_by_length( + self, min_length=None, max_length=None + ) -> "Gff3DataFrame": """Filtering the pandas dataframe by the gene_length. For this method the desired minimal and maximal bp length @@ -169,7 +175,9 @@ def filter_by_length(self, min_length=None, max_length=None) -> "Gff3DataFrame": filtered_by_length = self.df[ (gene_length >= min_length) & (gene_length <= max_length) ] - return Gff3DataFrame(input_df=filtered_by_length, input_header=self.header) + return Gff3DataFrame( + input_df=filtered_by_length, input_header=self.header + ) def attributes_to_columns(self) -> pd.DataFrame: """Saving each attribute-tag to a single column. @@ -199,7 +207,9 @@ def attributes_to_columns(self) -> pd.DataFrame: ) return df_attributes - def get_feature_by_attribute(self, attr_tag, attr_value_list) -> "Gff3DataFrame": + def get_feature_by_attribute( + self, attr_tag, attr_value_list + ) -> "Gff3DataFrame": """Filtering the pandas dataframe by a attribute. The 9th column of a gff3-file contains the list of feature @@ -223,8 +233,12 @@ def get_feature_by_attribute(self, attr_tag, attr_value_list) -> "Gff3DataFrame" """ df_copy = self.df.copy() attribute_df = Gff3DataFrame.attributes_to_columns(self) - filtered_by_attr_df = df_copy.loc[attribute_df[attr_tag].isin(attr_value_list)] - return Gff3DataFrame(input_df=filtered_by_attr_df, input_header=self.header) + filtered_by_attr_df = df_copy.loc[ + attribute_df[attr_tag].isin(attr_value_list) + ] + return Gff3DataFrame( + input_df=filtered_by_attr_df, input_header=self.header + ) def stats_dic(self) -> dict: """Gives the following statistics for the data: @@ -317,7 +331,9 @@ def overlaps_with( overlap_df = overlap_df[~condition] return Gff3DataFrame(input_df=overlap_df, input_header=self.header) - def find_duplicated_entries(self, seq_id=None, type=None) -> "Gff3DataFrame": + def find_duplicated_entries( + self, seq_id=None, type=None + ) -> "Gff3DataFrame": """Find entries which are redundant. For this method the chromosom accession number (seq_id) as well as the @@ -335,5 +351,7 @@ def find_duplicated_entries(self, seq_id=None, type=None) -> "Gff3DataFrame": """ input_df = self.df[self.df.seq_id == seq_id] df_feature = input_df[input_df.type == type] - duplicate = df_feature.loc[df_feature[["end", "start", "strand"]].duplicated()] + duplicate = df_feature.loc[ + df_feature[["end", "start", "strand"]].duplicated() + ] return Gff3DataFrame(input_df=duplicate, input_header=self.header) From 78349d27ee62437297f95eeffed2f6e2a69c71fd Mon Sep 17 00:00:00 2001 From: Joel Berendzen Date: Tue, 8 Dec 2020 17:06:14 -0700 Subject: [PATCH 4/6] travis, testing in tmp, human GFF --- .gitignore | 3 + .travis.yml | 33 +- fixtures | 1 - pytest.ini | 1 - requirements_dev.txt | 13 - tests/__init__.py | 21 +- tests/gffpandas | 1 - tests/pytest.ini | 3 - tests/test_gffpandas.py | 2079 ++++++++++++++------ tests/{fixtures => testdata}/test_file.gff | 0 travis_pypi_setup.py | 127 -- 11 files changed, 1537 insertions(+), 745 deletions(-) delete mode 120000 fixtures delete mode 120000 pytest.ini delete mode 100644 requirements_dev.txt delete mode 120000 tests/gffpandas delete mode 100644 tests/pytest.ini rename tests/{fixtures => testdata}/test_file.gff (100%) delete mode 100644 travis_pypi_setup.py diff --git a/.gitignore b/.gitignore index c5511d9..a3bc999 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,6 @@ target/ # virtual environment venv/ + +# test data +tests/data/ diff --git a/.travis.yml b/.travis.yml index 078318a..7cb10f5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,26 +1,15 @@ -# Config file for automatic testing at travis-ci.org -# This file will be regenerated if you run travis_pypi_setup.py - +dist: focal +sudo: false language: python +cache: + directories: + - $HOME/.cache/pip + - $HOME/tests/data python: - - 3.6 - -# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors + - "3.8" install: - - "pip install -r requirements_dev.txt" - -# command to run tests -script: cd tests && pytest && cd .. + - pip install -r requirements.txt + - pip install -v pytest pytest-cov pytest-datadir-mgr +script: + - pytest --cov=gffpandas -s -# After you create the Github repo and add it to Travis, run the -# travis_pypi_setup.py script to finish PyPI deployment setup -# deploy: -# provider: pypi -# distributions: sdist bdist_wheel -# user: konrad -# password: -# secure: PLEASE_REPLACE_ME -# on: -# tags: true -# repo: konrad/pandasgff -# python: 2.7 diff --git a/fixtures b/fixtures deleted file mode 120000 index da36c9f..0000000 --- a/fixtures +++ /dev/null @@ -1 +0,0 @@ -tests/fixtures/ \ No newline at end of file diff --git a/pytest.ini b/pytest.ini deleted file mode 120000 index f6a7c13..0000000 --- a/pytest.ini +++ /dev/null @@ -1 +0,0 @@ -tests/pytest.ini \ No newline at end of file diff --git a/requirements_dev.txt b/requirements_dev.txt deleted file mode 100644 index 2efd332..0000000 --- a/requirements_dev.txt +++ /dev/null @@ -1,13 +0,0 @@ -pip>=18.0 -bumpversion==0.5.3 -wheel==0.31.0 -watchdog==0.8.3 -flake8==3.5.0 -tox==3.1.2 -coverage==4.5.1 -Sphinx==1.7.6 -cryptography==3.2 -PyYAML==5.1 -pytest==3.6.3 -pytest-runner==4.2 -pandas>=1.0.0 diff --git a/tests/__init__.py b/tests/__init__.py index e47dbac..13a9598 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,3 +1,22 @@ # -*- coding: utf-8 -*- - """Unit test package for gffpandas.""" + +# standard library imports +import functools + + +def print_docstring(): + """Decorator to print a docstring.""" + + def decorator(func): + """Define decorator""" + + @functools.wraps(func) + def wrapper(*args, **kwargs): + """Print docstring and call function""" + print(func.__doc__) + return func(*args, **kwargs) + + return wrapper + + return decorator diff --git a/tests/gffpandas b/tests/gffpandas deleted file mode 120000 index 8208e1a..0000000 --- a/tests/gffpandas +++ /dev/null @@ -1 +0,0 @@ -../gffpandas/ \ No newline at end of file diff --git a/tests/pytest.ini b/tests/pytest.ini deleted file mode 100644 index d776a2b..0000000 --- a/tests/pytest.ini +++ /dev/null @@ -1,3 +0,0 @@ -[pytest] -filterwarnings = - ignore::UserWarning \ No newline at end of file diff --git a/tests/test_gffpandas.py b/tests/test_gffpandas.py index d9884c8..f2ec4a4 100644 --- a/tests/test_gffpandas.py +++ b/tests/test_gffpandas.py @@ -3,591 +3,1518 @@ """Tests for `gffpandas` package.""" +# standard library imports +import time + +# first-party imports import gffpandas.gffpandas as gff3pd + +# third-party imports import pandas as pd -import os - - -written_df = pd.DataFrame([ - ['NC_016810.1', 'RefSeq', 'region', 1, 4000, '.', '+', '.', - 'Dbxref=taxon:216597;ID=id0;gbkey=Src;genome=genomic;mol_type=' - 'genomic DNA;serovar=Typhimurium;strain=SL1344'], - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_tag=SL1344_0001'], - ['NC_016810.1', 'RefSeq', 'CDS', 13, 235, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=' - 'cds0;Name=YP_005179941.1;Parent=gene1;gbkey=CDS;product=thr operon' - ' leader peptide;protein_id=YP_005179941.1;transl_table=11'], - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002'], - ['NC_016810.1', 'RefSeq', 'CDS', 341, 523, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=' - 'cds0;Name=YP_005179941.1;Parent=gene2;gbkey=CDS;product=thr operon' - ' leader peptide;protein_id=YP_005179941.1;transl_table=11'], - ['NC_016810.1', 'RefSeq', 'gene', 1, 600, '.', '-', '.', - 'ID=gene3;Name=thrX;gbkey=Gene;gene=thrX;locus_tag=SL1344_0003'], - ['NC_016810.1', 'RefSeq', 'CDS', 21, 345, '.', '-', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=' - 'cds0;Name=YP_005179941.1;Parent=gene3;gbkey=CDS;product=thr operon' - ' leader peptide;protein_id=YP_005179941.1;transl_table=11'], - ['NC_016810.1', 'RefSeq', 'gene', 41, 255, '.', '+', '.', - 'ID=gene4;Name=thrB;gbkey=Gene;gene=thrB;locus_tag=SL1344_0004'], - ['NC_016810.1', 'RefSeq', 'CDS', 61, 195, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=' - 'cds0;Name=YP_005179941.1;Parent=gene4;gbkey=CDS;product=thr operon' - ' leader peptide;protein_id=YP_005179941.1;transl_table=11'], - ['NC_016810.1', 'RefSeq', 'gene', 170, 546, '.', '+', '.', - 'ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005'], - ['NC_016810.1', 'RefSeq', 'CDS', 34, 335, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=' - 'cds0;Name=YP_005179941.1;Parent=gene5;gbkey=CDS;product=thr operon' - ' leader peptide;protein_id=YP_005179941.1;transl_table=11'], - ], columns=["seq_id", "source", "type", "start", "end", - "score", "strand", "phase", "attributes"]) - -written_header = ('##gff-version 3\n' - '##sequence-region NC_016810.1 1 20\n') - - -written_csv = ('seq_id,source,type,start,end,score,strand,phase,attributes\n' - 'NC_016810.1,RefSeq,region,1,4000,.,+,.,Dbxref=taxon:216597;ID=' - 'id0;gbkey=Src;genome=genomic;mol_type=genomic DNA;serovar=' - 'Typhimurium;strain=SL1344\n' - 'NC_016810.1,RefSeq,gene,1,20,.,+,.,ID=gene1;Name=thrL;gbkey=' - 'Gene;gene=thrL;locus_tag=SL1344_0001\n' - 'NC_016810.1,RefSeq,CDS,13,235,.,+,0,Dbxref=UniProtKB%252FTr' - 'EMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051799' - '41.1;Parent=gene1;gbkey=CDS;product=thr operon leader peptide;' - 'protein_id=YP_005179941.1;transl_table=11\n' - 'NC_016810.1,RefSeq,gene,1,20,.,+,.,ID=gene2;Name=thrA;gbkey=' - 'Gene;gene=thrA;locus_tag=SL1344_0002\n' - 'NC_016810.1,RefSeq,CDS,341,523,.,+,0,Dbxref=UniProtKB%252FTr' - 'EMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051799' - '41.1;Parent=gene2;gbkey=CDS;product=thr operon leader peptide;' - 'protein_id=YP_005179941.1;transl_table=11\n' - 'NC_016810.1,RefSeq,gene,1,600,.,-,.,ID=gene3;Name=thrX;gbkey=' - 'Gene;gene=thrX;locus_tag=SL1344_0003\n' - 'NC_016810.1,RefSeq,CDS,21,345,.,-,0,Dbxref=UniProtKB%252FTr' - 'EMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051799' - '41.1;Parent=gene3;gbkey=CDS;product=thr operon leader peptide;' - 'protein_id=YP_005179941.1;transl_table=11\n' - 'NC_016810.1,RefSeq,gene,41,255,.,+,.,ID=gene4;Name=thrB;gbkey=' - 'Gene;gene=thrB;locus_tag=SL1344_0004\n' - 'NC_016810.1,RefSeq,CDS,61,195,.,+,0,Dbxref=UniProtKB%252FTr' - 'EMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051799' - '41.1;Parent=gene4;gbkey=CDS;product=thr operon leader peptide;' - 'protein_id=YP_005179941.1;transl_table=11\n' - 'NC_016810.1,RefSeq,gene,170,546,.,+,.,ID=gene5;Name=thrC;gbkey' - '=Gene;gene=thrC;locus_tag=SL1344_0005\n' - 'NC_016810.1,RefSeq,CDS,34,335,.,+,0,Dbxref=UniProtKB%252FTr' - 'EMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051799' - '41.1;Parent=gene5;gbkey=CDS;product=thr operon leader peptide;' - 'protein_id=YP_005179941.1;transl_table=11\n') - -written_tsv = ('seq_id\tsource\ttype\tstart\tend\tscore\tstrand\tphase\t' - 'attributes\n' - 'NC_016810.1\tRefSeq\tregion\t1\t4000\t.\t+\t.\tDbxref=taxon:21' - '6597;ID=id0;gbkey=Src;genome=genomic;mol_type=genomic DNA;' - 'serovar=Typhimurium;strain=SL1344\n' - 'NC_016810.1\tRefSeq\tgene\t1\t20\t.\t+\t.\tID=gene1;Name=thrL;' - 'gbkey=Gene;gene=thrL;locus_tag=SL1344_0001\n' - 'NC_016810.1\tRefSeq\tCDS\t13\t235\t.\t+\t0\tDbxref=UniProtKB%2' - '52FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051' - '79941.1;Parent=gene1;gbkey=CDS;product=thr operon leader ' - 'peptide;protein_id=YP_005179941.1;transl_table=11\n' - 'NC_016810.1\tRefSeq\tgene\t1\t20\t.\t+\t.\tID=gene2;Name=thrA;' - 'gbkey=Gene;gene=thrA;locus_tag=SL1344_0002\n' - 'NC_016810.1\tRefSeq\tCDS\t341\t523\t.\t+\t0\tDbxref=UniProtKB%' - '252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_005' - '179941.1;Parent=gene2;gbkey=CDS;product=thr operon leader ' - 'peptide;protein_id=YP_005179941.1;transl_table=11\n' - 'NC_016810.1\tRefSeq\tgene\t1\t600\t.\t-\t.\tID=gene3;Name=thrX' - ';gbkey=Gene;gene=thrX;locus_tag=SL1344_0003\n' - 'NC_016810.1\tRefSeq\tCDS\t21\t345\t.\t-\t0\tDbxref=UniProtKB%2' - '52FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051' - '79941.1;Parent=gene3;gbkey=CDS;product=thr operon leader ' - 'peptide;protein_id=YP_005179941.1;transl_table=11\n' - 'NC_016810.1\tRefSeq\tgene\t41\t255\t.\t+\t.\tID=gene4;Name=' - 'thrB;gbkey=Gene;gene=thrB;locus_tag=SL1344_0004\n' - 'NC_016810.1\tRefSeq\tCDS\t61\t195\t.\t+\t0\tDbxref=UniProtKB%2' - '52FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051' - '79941.1;Parent=gene4;gbkey=CDS;product=thr operon leader ' - 'peptide;protein_id=YP_005179941.1;transl_table=11\n' - 'NC_016810.1\tRefSeq\tgene\t170\t546\t.\t+\t.\tID=gene5;Name=' - 'thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005\n' - 'NC_016810.1\tRefSeq\tCDS\t34\t335\t.\t+\t0\tDbxref=UniProt' - 'KB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=' - 'YP_005179941.1;Parent=gene5;gbkey=CDS;product=thr operon ' - 'leader peptide;protein_id=YP_005179941.1;transl_table=11\n') - -written_gff = ('##gff-version 3\n' - '##sequence-region NC_016810.1 1 20\n' - 'NC_016810.1 RefSeq region 1 4000 . +' - ' . Dbxref=taxon:216597;ID=id0;gbkey=Src;genome=ge' - 'nomic;mol_type=genomic DNA;serovar=Typhimurium;strain=SL1344\n' - 'NC_016810.1 RefSeq gene 1 20 . +' - ' . ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_' - 'tag=SL1344_0001\n' - 'NC_016810.1 RefSeq CDS 13 235 . +' - ' 0 Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:Y' - 'P_005179941.1;ID=cds0;Name=YP_005179941.1;Parent=gene1;gbkey=C' - 'DS;product=thr operon leader peptide;protein_id=YP_005179941.1' - ';transl_table=11\n' - 'NC_016810.1 RefSeq gene 1 20 . +' - ' . ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_' - 'tag=SL1344_0002\n' - 'NC_016810.1 RefSeq CDS 341 523 . +' - ' 0 Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:Y' - 'P_005179941.1;ID=cds0;Name=YP_005179941.1;Parent=gene2;gbkey=C' - 'DS;product=thr operon leader peptide;protein_id=YP_005179941.1' - ';transl_table=11\n' - 'NC_016810.1 RefSeq gene 1 600 . -' - ' . ID=gene3;Name=thrX;gbkey=Gene;gene=thrX;locus_' - 'tag=SL1344_0003\n' - 'NC_016810.1 RefSeq CDS 21 345 . -' - ' 0 Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:Y' - 'P_005179941.1;ID=cds0;Name=YP_005179941.1;Parent=gene3;gbkey=C' - 'DS;product=thr operon leader peptide;protein_id=YP_005179941.1' - ';transl_table=11\n' - 'NC_016810.1 RefSeq gene 41 255 . +' - ' . ID=gene4;Name=thrB;gbkey=Gene;gene=thrB;locus_' - 'tag=SL1344_0004\n' - 'NC_016810.1 RefSeq CDS 61 195 . +' - ' 0 Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:Y' - 'P_005179941.1;ID=cds0;Name=YP_005179941.1;Parent=gene4;gbkey=C' - 'DS;product=thr operon leader peptide;protein_id=YP_005179941.1' - ';transl_table=11\n' - 'NC_016810.1 RefSeq gene 170 546 . +' - ' . ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_' - 'tag=SL1344_0005\n' - 'NC_016810.1 RefSeq CDS 34 335 . +' - ' 0 Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:Y' - 'P_005179941.1;ID=cds0;Name=YP_005179941.1;Parent=gene5;gbkey=C' - 'DS;product=thr operon leader peptide;protein_id=YP_005179941.1' - ';transl_table=11\n') - - -written_filtered_length = pd.DataFrame([ - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_tag=SL1344_0001'], - ['NC_016810.1', 'RefSeq', 'CDS', 13, 235, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name' - '=YP_005179941.1;Parent=gene1;gbkey=CDS;product=thr operon leader peptide' - ';protein_id=YP_005179941.1;transl_table=11'], - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002'], - ['NC_016810.1', 'RefSeq', 'CDS', 341, 523, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name' - '=YP_005179941.1;Parent=gene2;gbkey=CDS;product=thr operon leader peptide' - ';protein_id=YP_005179941.1;transl_table=11'], - ['NC_016810.1', 'RefSeq', 'gene', 41, 255, '.', '+', '.', - 'ID=gene4;Name=thrB;gbkey=Gene;gene=thrB;locus_tag=SL1344_0004'], - ['NC_016810.1', 'RefSeq', 'CDS', 61, 195, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name' - '=YP_005179941.1;Parent=gene4;gbkey=CDS;product=thr operon leader peptide' - ';protein_id=YP_005179941.1;transl_table=11'], - ], columns=["seq_id", "source", "type", "start", "end", - "score", "strand", "phase", "attributes"], - index=[1, 2, 3, 4, 7, 8]) - -compare_get_feature_by_attribute = pd.DataFrame([ - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_tag=SL1344_0001'], - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002'], - ['NC_016810.1', 'RefSeq', 'gene', 1, 600, '.', '-', '.', - 'ID=gene3;Name=thrX;gbkey=Gene;gene=thrX;locus_tag=SL1344_0003'], - ['NC_016810.1', 'RefSeq', 'gene', 41, 255, '.', '+', '.', - 'ID=gene4;Name=thrB;gbkey=Gene;gene=thrB;locus_tag=SL1344_0004'], - ['NC_016810.1', 'RefSeq', 'gene', 170, 546, '.', '+', '.', - 'ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005'], - ], columns=["seq_id", "source", "type", "start", "end", - "score", "strand", "phase", "attributes"], - index=[1, 3, 5, 7, 9]) - -compare_get_feature_by_attribute2 = pd.DataFrame([ - ['NC_016810.1', 'RefSeq', 'CDS', 341, 523, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name' - '=YP_005179941.1;Parent=gene2;gbkey=CDS;product=thr operon leader peptide' - ';protein_id=YP_005179941.1;transl_table=11'], - ['NC_016810.1', 'RefSeq', 'CDS', 21, 345, '.', '-', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=' - 'cds0;Name=YP_005179941.1;Parent=gene3;gbkey=CDS;product=thr operon' - ' leader peptide;protein_id=YP_005179941.1;transl_table=11'], - ['NC_016810.1', 'RefSeq', 'CDS', 61, 195, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=' - 'cds0;Name=YP_005179941.1;Parent=gene4;gbkey=CDS;product=thr operon' - ' leader peptide;protein_id=YP_005179941.1;transl_table=11'], - ], columns=["seq_id", "source", "type", "start", "end", - "score", "strand", "phase", "attributes"], - index=[4, 6, 8]) - - -written_attribute_df = pd.DataFrame([ - ['NC_016810.1', 'RefSeq', 'region', 1, 4000, '.', '+', '.', - 'Dbxref=taxon:216597;ID=id0;gbkey=Src;genome=genomic;mol_type=genomic' - ' DNA;serovar=Typhimurium;strain=SL1344', - 'taxon:216597', 'id0', None, None, 'Src', None, 'genomic', - None, 'genomic DNA', None, None, 'Typhimurium', 'SL1344', - None], - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_tag=SL1344_0001', - None, 'gene1', 'thrL', None, 'Gene', 'thrL', None, - 'SL1344_0001', None, None, None, None, None, None], - ['NC_016810.1', 'RefSeq', 'CDS', 13, 235, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;' - 'Name=YP_005179941.1;Parent=gene1;gbkey=CDS;product=thr operon leader' - ' peptide;protein_id=YP_005179941.1;transl_table=11', - 'UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1', - 'cds0', 'YP_005179941.1', 'gene1', 'CDS', None, None, - None, None, 'thr operon leader peptide', - 'YP_005179941.1', None, None, '11'], - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002', - None, 'gene2', 'thrA', None, 'Gene', 'thrA', None, - 'SL1344_0002', None, None, None, None, None, None], - ['NC_016810.1', 'RefSeq', 'CDS', 341, 523, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;' - 'Name=YP_005179941.1;Parent=gene2;gbkey=CDS;product=thr operon leader' - ' peptide;protein_id=YP_005179941.1;transl_table=11', - 'UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1', 'cds0', - 'YP_005179941.1', 'gene2', 'CDS', None, None, None, None, - 'thr operon leader peptide', - 'YP_005179941.1', None, None, '11'], - ['NC_016810.1', 'RefSeq', 'gene', 1, 600, '.', '-', '.', - 'ID=gene3;Name=thrX;gbkey=Gene;gene=thrX;locus_tag=SL1344_0003', - None, 'gene3', 'thrX', None, 'Gene', 'thrX', None, - 'SL1344_0003', None, None, None, None, None, None], - ['NC_016810.1', 'RefSeq', 'CDS', 21, 345, '.', '-', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;' - 'Name=YP_005179941.1;Parent=gene3;gbkey=CDS;product=thr operon leader' - ' peptide;protein_id=YP_005179941.1;transl_table=11', - 'UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1', 'cds0', - 'YP_005179941.1', 'gene3', 'CDS', None, None, None, None, - 'thr operon leader peptide', - 'YP_005179941.1', None, None, '11'], - ['NC_016810.1', 'RefSeq', 'gene', 41, 255, '.', '+', '.', - 'ID=gene4;Name=thrB;gbkey=Gene;gene=thrB;locus_tag=SL1344_0004', - None, 'gene4', 'thrB', None, 'Gene', 'thrB', None, - 'SL1344_0004', None, None, None, None, None, None], - ['NC_016810.1', 'RefSeq', 'CDS', 61, 195, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;' - 'Name=YP_005179941.1;Parent=gene4;gbkey=CDS;product=thr operon leader' - ' peptide;protein_id=YP_005179941.1;transl_table=11', - 'UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1', 'cds0', - 'YP_005179941.1', 'gene4', 'CDS', None, None, None, None, - 'thr operon leader peptide', - 'YP_005179941.1', None, None, '11'], - ['NC_016810.1', 'RefSeq', 'gene', 170, 546, '.', '+', '.', - 'ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005', - None, 'gene5', 'thrC', None, 'Gene', 'thrC', None, - 'SL1344_0005', None, None, None, None, None, None], - ['NC_016810.1', 'RefSeq', 'CDS', 34, 335, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;' - 'Name=YP_005179941.1;Parent=gene5;gbkey=CDS;product=thr operon leader' - ' peptide;protein_id=YP_005179941.1;transl_table=11', - 'UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1', 'cds0', - 'YP_005179941.1', 'gene5', 'CDS', None, None, None, None, - 'thr operon leader peptide', - 'YP_005179941.1', None, None, '11'], - ], columns=["seq_id", "source", "type", "start", "end", - "score", "strand", "phase", "attributes", "Dbxref", - "ID", "Name", "Parent", "gbkey", "gene", "genome", - "locus_tag", "mol_type", "product", "protein_id", - "serovar", "strain", "transl_table"]) - - -strand_counts = pd.value_counts(written_df['strand']).to_dict() -type_counts = pd.value_counts(written_df['type']).to_dict() + +# module imports +from . import print_docstring + +# global constants +REFSEQ_URL = ( + "https://ftp.ncbi.nih.gov/genomes/refseq/vertebrate_mammalian" + + "/Homo_sapiens/annotation_releases/109.20191205/GCF_000001405.39_GRCh38.p13/" +) +HUMAN_GFF = "GCF_000001405.39_GRCh38.p13_genomic.gff" +TESTFILELIST = ["test_file.gff"] + + +written_df = pd.DataFrame( + [ + [ + "NC_016810.1", + "RefSeq", + "region", + 1, + 4000, + ".", + "+", + ".", + "Dbxref=taxon:216597;ID=id0;gbkey=Src;genome=genomic;mol_type=" + "genomic DNA;serovar=Typhimurium;strain=SL1344", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_tag=SL1344_0001", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 13, + 235, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=" + "cds0;Name=YP_005179941.1;Parent=gene1;gbkey=CDS;product=thr operon" + " leader peptide;protein_id=YP_005179941.1;transl_table=11", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 341, + 523, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=" + "cds0;Name=YP_005179941.1;Parent=gene2;gbkey=CDS;product=thr operon" + " leader peptide;protein_id=YP_005179941.1;transl_table=11", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 600, + ".", + "-", + ".", + "ID=gene3;Name=thrX;gbkey=Gene;gene=thrX;locus_tag=SL1344_0003", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 21, + 345, + ".", + "-", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=" + "cds0;Name=YP_005179941.1;Parent=gene3;gbkey=CDS;product=thr operon" + " leader peptide;protein_id=YP_005179941.1;transl_table=11", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 41, + 255, + ".", + "+", + ".", + "ID=gene4;Name=thrB;gbkey=Gene;gene=thrB;locus_tag=SL1344_0004", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 61, + 195, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=" + "cds0;Name=YP_005179941.1;Parent=gene4;gbkey=CDS;product=thr operon" + " leader peptide;protein_id=YP_005179941.1;transl_table=11", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 170, + 546, + ".", + "+", + ".", + "ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 34, + 335, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=" + "cds0;Name=YP_005179941.1;Parent=gene5;gbkey=CDS;product=thr operon" + " leader peptide;protein_id=YP_005179941.1;transl_table=11", + ], + ], + columns=[ + "seq_id", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + ], +) + +written_header = "##gff-version 3\n" "##sequence-region NC_016810.1 1 20\n" + + +written_csv = ( + "seq_id,source,type,start,end,score,strand,phase,attributes\n" + "NC_016810.1,RefSeq,region,1,4000,.,+,.,Dbxref=taxon:216597;ID=" + "id0;gbkey=Src;genome=genomic;mol_type=genomic DNA;serovar=" + "Typhimurium;strain=SL1344\n" + "NC_016810.1,RefSeq,gene,1,20,.,+,.,ID=gene1;Name=thrL;gbkey=" + "Gene;gene=thrL;locus_tag=SL1344_0001\n" + "NC_016810.1,RefSeq,CDS,13,235,.,+,0,Dbxref=UniProtKB%252FTr" + "EMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051799" + "41.1;Parent=gene1;gbkey=CDS;product=thr operon leader peptide;" + "protein_id=YP_005179941.1;transl_table=11\n" + "NC_016810.1,RefSeq,gene,1,20,.,+,.,ID=gene2;Name=thrA;gbkey=" + "Gene;gene=thrA;locus_tag=SL1344_0002\n" + "NC_016810.1,RefSeq,CDS,341,523,.,+,0,Dbxref=UniProtKB%252FTr" + "EMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051799" + "41.1;Parent=gene2;gbkey=CDS;product=thr operon leader peptide;" + "protein_id=YP_005179941.1;transl_table=11\n" + "NC_016810.1,RefSeq,gene,1,600,.,-,.,ID=gene3;Name=thrX;gbkey=" + "Gene;gene=thrX;locus_tag=SL1344_0003\n" + "NC_016810.1,RefSeq,CDS,21,345,.,-,0,Dbxref=UniProtKB%252FTr" + "EMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051799" + "41.1;Parent=gene3;gbkey=CDS;product=thr operon leader peptide;" + "protein_id=YP_005179941.1;transl_table=11\n" + "NC_016810.1,RefSeq,gene,41,255,.,+,.,ID=gene4;Name=thrB;gbkey=" + "Gene;gene=thrB;locus_tag=SL1344_0004\n" + "NC_016810.1,RefSeq,CDS,61,195,.,+,0,Dbxref=UniProtKB%252FTr" + "EMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051799" + "41.1;Parent=gene4;gbkey=CDS;product=thr operon leader peptide;" + "protein_id=YP_005179941.1;transl_table=11\n" + "NC_016810.1,RefSeq,gene,170,546,.,+,.,ID=gene5;Name=thrC;gbkey" + "=Gene;gene=thrC;locus_tag=SL1344_0005\n" + "NC_016810.1,RefSeq,CDS,34,335,.,+,0,Dbxref=UniProtKB%252FTr" + "EMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051799" + "41.1;Parent=gene5;gbkey=CDS;product=thr operon leader peptide;" + "protein_id=YP_005179941.1;transl_table=11\n" +) + +written_tsv = ( + "seq_id\tsource\ttype\tstart\tend\tscore\tstrand\tphase\t" + "attributes\n" + "NC_016810.1\tRefSeq\tregion\t1\t4000\t.\t+\t.\tDbxref=taxon:21" + "6597;ID=id0;gbkey=Src;genome=genomic;mol_type=genomic DNA;" + "serovar=Typhimurium;strain=SL1344\n" + "NC_016810.1\tRefSeq\tgene\t1\t20\t.\t+\t.\tID=gene1;Name=thrL;" + "gbkey=Gene;gene=thrL;locus_tag=SL1344_0001\n" + "NC_016810.1\tRefSeq\tCDS\t13\t235\t.\t+\t0\tDbxref=UniProtKB%2" + "52FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051" + "79941.1;Parent=gene1;gbkey=CDS;product=thr operon leader " + "peptide;protein_id=YP_005179941.1;transl_table=11\n" + "NC_016810.1\tRefSeq\tgene\t1\t20\t.\t+\t.\tID=gene2;Name=thrA;" + "gbkey=Gene;gene=thrA;locus_tag=SL1344_0002\n" + "NC_016810.1\tRefSeq\tCDS\t341\t523\t.\t+\t0\tDbxref=UniProtKB%" + "252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_005" + "179941.1;Parent=gene2;gbkey=CDS;product=thr operon leader " + "peptide;protein_id=YP_005179941.1;transl_table=11\n" + "NC_016810.1\tRefSeq\tgene\t1\t600\t.\t-\t.\tID=gene3;Name=thrX" + ";gbkey=Gene;gene=thrX;locus_tag=SL1344_0003\n" + "NC_016810.1\tRefSeq\tCDS\t21\t345\t.\t-\t0\tDbxref=UniProtKB%2" + "52FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051" + "79941.1;Parent=gene3;gbkey=CDS;product=thr operon leader " + "peptide;protein_id=YP_005179941.1;transl_table=11\n" + "NC_016810.1\tRefSeq\tgene\t41\t255\t.\t+\t.\tID=gene4;Name=" + "thrB;gbkey=Gene;gene=thrB;locus_tag=SL1344_0004\n" + "NC_016810.1\tRefSeq\tCDS\t61\t195\t.\t+\t0\tDbxref=UniProtKB%2" + "52FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=YP_0051" + "79941.1;Parent=gene4;gbkey=CDS;product=thr operon leader " + "peptide;protein_id=YP_005179941.1;transl_table=11\n" + "NC_016810.1\tRefSeq\tgene\t170\t546\t.\t+\t.\tID=gene5;Name=" + "thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005\n" + "NC_016810.1\tRefSeq\tCDS\t34\t335\t.\t+\t0\tDbxref=UniProt" + "KB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name=" + "YP_005179941.1;Parent=gene5;gbkey=CDS;product=thr operon " + "leader peptide;protein_id=YP_005179941.1;transl_table=11\n" +) + +written_gff = ( + "##gff-version 3\n" + "##sequence-region NC_016810.1 1 20\n" + "NC_016810.1 RefSeq region 1 4000 . +" + " . Dbxref=taxon:216597;ID=id0;gbkey=Src;genome=ge" + "nomic;mol_type=genomic DNA;serovar=Typhimurium;strain=SL1344\n" + "NC_016810.1 RefSeq gene 1 20 . +" + " . ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_" + "tag=SL1344_0001\n" + "NC_016810.1 RefSeq CDS 13 235 . +" + " 0 Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:Y" + "P_005179941.1;ID=cds0;Name=YP_005179941.1;Parent=gene1;gbkey=C" + "DS;product=thr operon leader peptide;protein_id=YP_005179941.1" + ";transl_table=11\n" + "NC_016810.1 RefSeq gene 1 20 . +" + " . ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_" + "tag=SL1344_0002\n" + "NC_016810.1 RefSeq CDS 341 523 . +" + " 0 Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:Y" + "P_005179941.1;ID=cds0;Name=YP_005179941.1;Parent=gene2;gbkey=C" + "DS;product=thr operon leader peptide;protein_id=YP_005179941.1" + ";transl_table=11\n" + "NC_016810.1 RefSeq gene 1 600 . -" + " . ID=gene3;Name=thrX;gbkey=Gene;gene=thrX;locus_" + "tag=SL1344_0003\n" + "NC_016810.1 RefSeq CDS 21 345 . -" + " 0 Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:Y" + "P_005179941.1;ID=cds0;Name=YP_005179941.1;Parent=gene3;gbkey=C" + "DS;product=thr operon leader peptide;protein_id=YP_005179941.1" + ";transl_table=11\n" + "NC_016810.1 RefSeq gene 41 255 . +" + " . ID=gene4;Name=thrB;gbkey=Gene;gene=thrB;locus_" + "tag=SL1344_0004\n" + "NC_016810.1 RefSeq CDS 61 195 . +" + " 0 Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:Y" + "P_005179941.1;ID=cds0;Name=YP_005179941.1;Parent=gene4;gbkey=C" + "DS;product=thr operon leader peptide;protein_id=YP_005179941.1" + ";transl_table=11\n" + "NC_016810.1 RefSeq gene 170 546 . +" + " . ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_" + "tag=SL1344_0005\n" + "NC_016810.1 RefSeq CDS 34 335 . +" + " 0 Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:Y" + "P_005179941.1;ID=cds0;Name=YP_005179941.1;Parent=gene5;gbkey=C" + "DS;product=thr operon leader peptide;protein_id=YP_005179941.1" + ";transl_table=11\n" +) + + +written_filtered_length = pd.DataFrame( + [ + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_tag=SL1344_0001", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 13, + 235, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name" + "=YP_005179941.1;Parent=gene1;gbkey=CDS;product=thr operon leader peptide" + ";protein_id=YP_005179941.1;transl_table=11", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 341, + 523, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name" + "=YP_005179941.1;Parent=gene2;gbkey=CDS;product=thr operon leader peptide" + ";protein_id=YP_005179941.1;transl_table=11", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 41, + 255, + ".", + "+", + ".", + "ID=gene4;Name=thrB;gbkey=Gene;gene=thrB;locus_tag=SL1344_0004", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 61, + 195, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name" + "=YP_005179941.1;Parent=gene4;gbkey=CDS;product=thr operon leader peptide" + ";protein_id=YP_005179941.1;transl_table=11", + ], + ], + columns=[ + "seq_id", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + ], + index=[1, 2, 3, 4, 7, 8], +) + +compare_get_feature_by_attribute = pd.DataFrame( + [ + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_tag=SL1344_0001", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 600, + ".", + "-", + ".", + "ID=gene3;Name=thrX;gbkey=Gene;gene=thrX;locus_tag=SL1344_0003", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 41, + 255, + ".", + "+", + ".", + "ID=gene4;Name=thrB;gbkey=Gene;gene=thrB;locus_tag=SL1344_0004", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 170, + 546, + ".", + "+", + ".", + "ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005", + ], + ], + columns=[ + "seq_id", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + ], + index=[1, 3, 5, 7, 9], +) + +compare_get_feature_by_attribute2 = pd.DataFrame( + [ + [ + "NC_016810.1", + "RefSeq", + "CDS", + 341, + 523, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name" + "=YP_005179941.1;Parent=gene2;gbkey=CDS;product=thr operon leader peptide" + ";protein_id=YP_005179941.1;transl_table=11", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 21, + 345, + ".", + "-", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=" + "cds0;Name=YP_005179941.1;Parent=gene3;gbkey=CDS;product=thr operon" + " leader peptide;protein_id=YP_005179941.1;transl_table=11", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 61, + 195, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=" + "cds0;Name=YP_005179941.1;Parent=gene4;gbkey=CDS;product=thr operon" + " leader peptide;protein_id=YP_005179941.1;transl_table=11", + ], + ], + columns=[ + "seq_id", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + ], + index=[4, 6, 8], +) + + +written_attribute_df = pd.DataFrame( + [ + [ + "NC_016810.1", + "RefSeq", + "region", + 1, + 4000, + ".", + "+", + ".", + "Dbxref=taxon:216597;ID=id0;gbkey=Src;genome=genomic;mol_type=genomic" + " DNA;serovar=Typhimurium;strain=SL1344", + "taxon:216597", + "id0", + None, + None, + "Src", + None, + "genomic", + None, + "genomic DNA", + None, + None, + "Typhimurium", + "SL1344", + None, + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_tag=SL1344_0001", + None, + "gene1", + "thrL", + None, + "Gene", + "thrL", + None, + "SL1344_0001", + None, + None, + None, + None, + None, + None, + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 13, + 235, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;" + "Name=YP_005179941.1;Parent=gene1;gbkey=CDS;product=thr operon leader" + " peptide;protein_id=YP_005179941.1;transl_table=11", + "UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1", + "cds0", + "YP_005179941.1", + "gene1", + "CDS", + None, + None, + None, + None, + "thr operon leader peptide", + "YP_005179941.1", + None, + None, + "11", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002", + None, + "gene2", + "thrA", + None, + "Gene", + "thrA", + None, + "SL1344_0002", + None, + None, + None, + None, + None, + None, + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 341, + 523, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;" + "Name=YP_005179941.1;Parent=gene2;gbkey=CDS;product=thr operon leader" + " peptide;protein_id=YP_005179941.1;transl_table=11", + "UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1", + "cds0", + "YP_005179941.1", + "gene2", + "CDS", + None, + None, + None, + None, + "thr operon leader peptide", + "YP_005179941.1", + None, + None, + "11", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 600, + ".", + "-", + ".", + "ID=gene3;Name=thrX;gbkey=Gene;gene=thrX;locus_tag=SL1344_0003", + None, + "gene3", + "thrX", + None, + "Gene", + "thrX", + None, + "SL1344_0003", + None, + None, + None, + None, + None, + None, + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 21, + 345, + ".", + "-", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;" + "Name=YP_005179941.1;Parent=gene3;gbkey=CDS;product=thr operon leader" + " peptide;protein_id=YP_005179941.1;transl_table=11", + "UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1", + "cds0", + "YP_005179941.1", + "gene3", + "CDS", + None, + None, + None, + None, + "thr operon leader peptide", + "YP_005179941.1", + None, + None, + "11", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 41, + 255, + ".", + "+", + ".", + "ID=gene4;Name=thrB;gbkey=Gene;gene=thrB;locus_tag=SL1344_0004", + None, + "gene4", + "thrB", + None, + "Gene", + "thrB", + None, + "SL1344_0004", + None, + None, + None, + None, + None, + None, + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 61, + 195, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;" + "Name=YP_005179941.1;Parent=gene4;gbkey=CDS;product=thr operon leader" + " peptide;protein_id=YP_005179941.1;transl_table=11", + "UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1", + "cds0", + "YP_005179941.1", + "gene4", + "CDS", + None, + None, + None, + None, + "thr operon leader peptide", + "YP_005179941.1", + None, + None, + "11", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 170, + 546, + ".", + "+", + ".", + "ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005", + None, + "gene5", + "thrC", + None, + "Gene", + "thrC", + None, + "SL1344_0005", + None, + None, + None, + None, + None, + None, + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 34, + 335, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;" + "Name=YP_005179941.1;Parent=gene5;gbkey=CDS;product=thr operon leader" + " peptide;protein_id=YP_005179941.1;transl_table=11", + "UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1", + "cds0", + "YP_005179941.1", + "gene5", + "CDS", + None, + None, + None, + None, + "thr operon leader peptide", + "YP_005179941.1", + None, + None, + "11", + ], + ], + columns=[ + "seq_id", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + "Dbxref", + "ID", + "Name", + "Parent", + "gbkey", + "gene", + "genome", + "locus_tag", + "mol_type", + "product", + "protein_id", + "serovar", + "strain", + "transl_table", + ], +) + + +strand_counts = pd.value_counts(written_df["strand"]).to_dict() +type_counts = pd.value_counts(written_df["type"]).to_dict() compare_stats_dic = { - 'Maximal_bp_length': - 599, - 'Minimal_bp_length': - 19, - 'Counted_strands': - strand_counts, - 'Counted_feature_types': - type_counts - } - - -df_empty = pd.DataFrame({}, columns=["seq_id", "source", "type", "start", - "end", "score", "strand", "phase", - "attributes"], index=[]) - -redundant_entry = pd.DataFrame([ - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002'], - ], columns=["seq_id", "source", "type", "start", "end", "score", - "strand", "phase", "attributes"], - index=[3]) - -compare_filter_feature_df = pd.DataFrame([ - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_tag=SL1344_0001'], - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002'], - ['NC_016810.1', 'RefSeq', 'gene', 1, 600, '.', '-', '.', - 'ID=gene3;Name=thrX;gbkey=Gene;gene=thrX;locus_tag=SL1344_0003'], - ['NC_016810.1', 'RefSeq', 'gene', 41, 255, '.', '+', '.', - 'ID=gene4;Name=thrB;gbkey=Gene;gene=thrB;locus_tag=SL1344_0004'], - ['NC_016810.1', 'RefSeq', 'gene', 170, 546, '.', '+', '.', - 'ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005'], - ], columns=["seq_id", "source", "type", "start", "end", - "score", "strand", "phase", "attributes"], - index=[1, 3, 5, 7, 9]) - -compare_overlap_gene_1_40 = pd.DataFrame([ - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_tag=SL1344_0001'], - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002'], - ], columns=["seq_id", "source", "type", "start", "end", "score", - "strand", "phase", "attributes"], - index=[1, 3]) - -compare_overlap_40_300 = pd.DataFrame([ - ['NC_016810.1', 'RefSeq', 'region', 1, 4000, '.', '+', '.', - 'Dbxref=taxon:216597;ID=id0;gbkey=Src;genome=genomic;mol_type=genomic DNA' - ';serovar=Typhimurium;strain=SL1344'], - ['NC_016810.1', 'RefSeq', 'CDS', 13, 235, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name' - '=YP_005179941.1;Parent=gene1;gbkey=CDS;product=thr operon leader peptide' - ';protein_id=YP_005179941.1;transl_table=11'], - ['NC_016810.1', 'RefSeq', 'gene', 41, 255, '.', '+', '.', - 'ID=gene4;Name=thrB;gbkey=Gene;gene=thrB;locus_tag=SL1344_0004'], - ['NC_016810.1', 'RefSeq', 'CDS', 61, 195, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name' - '=YP_005179941.1;Parent=gene4;gbkey=CDS;product=thr operon leader peptide' - ';protein_id=YP_005179941.1;transl_table=11'], - ['NC_016810.1', 'RefSeq', 'gene', 170, 546, '.', '+', '.', - 'ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005'], - ['NC_016810.1', 'RefSeq', 'CDS', 34, 335, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name' - '=YP_005179941.1;Parent=gene5;gbkey=CDS;product=thr operon leader peptide' - ';protein_id=YP_005179941.1;transl_table=11'], - ], columns=["seq_id", "source", "type", "start", "end", "score", - "strand", "phase", "attributes"], - index=[0, 2, 7, 8, 9, 10]) - -compare_overlap_170_171 = pd.DataFrame([ - ['NC_016810.1', 'RefSeq', 'gene', 1, 600, '.', '-', '.', - 'ID=gene3;Name=thrX;gbkey=Gene;gene=thrX;locus_tag=SL1344_0003'], - ['NC_016810.1', 'RefSeq', 'CDS', 21, 345, '.', '-', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name' - '=YP_005179941.1;Parent=gene3;gbkey=CDS;product=thr operon leader peptide' - ';protein_id=YP_005179941.1;transl_table=11'], - ], columns=["seq_id", "source", "type", "start", "end", "score", - "strand", "phase", "attributes"], - index=[5, 6]) - -compare_overlap_525_545 = pd.DataFrame([ - ['NC_016810.1', 'RefSeq', 'region', 1, 4000, '.', '+', '.', - 'Dbxref=taxon:216597;ID=id0;gbkey=Src;genome=genomic;mol_type=genomic DNA' - ';serovar=Typhimurium;strain=SL1344'], - ['NC_016810.1', 'RefSeq', 'gene', 170, 546, '.', '+', '.', - 'ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005'], - ], columns=["seq_id", "source", "type", "start", "end", "score", - "strand", "phase", "attributes"], - index=[0, 9]) - -compare_overlap_341_500 = pd.DataFrame([ - ['NC_016810.1', 'RefSeq', 'region', 1, 4000, '.', '+', '.', - 'Dbxref=taxon:216597;ID=id0;gbkey=Src;genome=genomic;mol_type=genomic DNA' - ';serovar=Typhimurium;strain=SL1344'], - ['NC_016810.1', 'RefSeq', 'CDS', 341, 523, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;' - 'Name=YP_005179941.1;Parent=gene2;gbkey=CDS;product=thr operon leader pep' - 'tide;protein_id=YP_005179941.1;transl_table=11'], - ['NC_016810.1', 'RefSeq', 'gene', 170, 546, '.', '+', '.', - 'ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005'], - ], columns=["seq_id", "source", "type", "start", "end", "score", - "strand", "phase", "attributes"], - index=[0, 4, 9]) - - -compare_complement = pd.DataFrame([ - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_tag=SL1344_0001'], - ['NC_016810.1', 'RefSeq', 'gene', 1, 20, '.', '+', '.', - 'ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002'], - ['NC_016810.1', 'RefSeq', 'CDS', 341, 523, '.', '+', '0', - 'Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name' - '=YP_005179941.1;Parent=gene2;gbkey=CDS;product=thr operon leader peptide' - ';protein_id=YP_005179941.1;transl_table=11'], - ], columns=["seq_id", "source", "type", "start", "end", "score", - "strand", "phase", "attributes"], - index=[1, 3, 4]) + "Maximal_bp_length": 599, + "Minimal_bp_length": 19, + "Counted_strands": strand_counts, + "Counted_feature_types": type_counts, +} + + +df_empty = pd.DataFrame( + {}, + columns=[ + "seq_id", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + ], + index=[], +) + +redundant_entry = pd.DataFrame( + [ + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002", + ], + ], + columns=[ + "seq_id", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + ], + index=[3], +) + +compare_filter_feature_df = pd.DataFrame( + [ + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_tag=SL1344_0001", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 600, + ".", + "-", + ".", + "ID=gene3;Name=thrX;gbkey=Gene;gene=thrX;locus_tag=SL1344_0003", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 41, + 255, + ".", + "+", + ".", + "ID=gene4;Name=thrB;gbkey=Gene;gene=thrB;locus_tag=SL1344_0004", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 170, + 546, + ".", + "+", + ".", + "ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005", + ], + ], + columns=[ + "seq_id", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + ], + index=[1, 3, 5, 7, 9], +) + +compare_overlap_gene_1_40 = pd.DataFrame( + [ + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_tag=SL1344_0001", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002", + ], + ], + columns=[ + "seq_id", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + ], + index=[1, 3], +) + +compare_overlap_40_300 = pd.DataFrame( + [ + [ + "NC_016810.1", + "RefSeq", + "region", + 1, + 4000, + ".", + "+", + ".", + "Dbxref=taxon:216597;ID=id0;gbkey=Src;genome=genomic;mol_type=genomic DNA" + ";serovar=Typhimurium;strain=SL1344", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 13, + 235, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name" + "=YP_005179941.1;Parent=gene1;gbkey=CDS;product=thr operon leader peptide" + ";protein_id=YP_005179941.1;transl_table=11", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 41, + 255, + ".", + "+", + ".", + "ID=gene4;Name=thrB;gbkey=Gene;gene=thrB;locus_tag=SL1344_0004", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 61, + 195, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name" + "=YP_005179941.1;Parent=gene4;gbkey=CDS;product=thr operon leader peptide" + ";protein_id=YP_005179941.1;transl_table=11", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 170, + 546, + ".", + "+", + ".", + "ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 34, + 335, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name" + "=YP_005179941.1;Parent=gene5;gbkey=CDS;product=thr operon leader peptide" + ";protein_id=YP_005179941.1;transl_table=11", + ], + ], + columns=[ + "seq_id", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + ], + index=[0, 2, 7, 8, 9, 10], +) + +compare_overlap_170_171 = pd.DataFrame( + [ + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 600, + ".", + "-", + ".", + "ID=gene3;Name=thrX;gbkey=Gene;gene=thrX;locus_tag=SL1344_0003", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 21, + 345, + ".", + "-", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name" + "=YP_005179941.1;Parent=gene3;gbkey=CDS;product=thr operon leader peptide" + ";protein_id=YP_005179941.1;transl_table=11", + ], + ], + columns=[ + "seq_id", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + ], + index=[5, 6], +) + +compare_overlap_525_545 = pd.DataFrame( + [ + [ + "NC_016810.1", + "RefSeq", + "region", + 1, + 4000, + ".", + "+", + ".", + "Dbxref=taxon:216597;ID=id0;gbkey=Src;genome=genomic;mol_type=genomic DNA" + ";serovar=Typhimurium;strain=SL1344", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 170, + 546, + ".", + "+", + ".", + "ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005", + ], + ], + columns=[ + "seq_id", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + ], + index=[0, 9], +) + +compare_overlap_341_500 = pd.DataFrame( + [ + [ + "NC_016810.1", + "RefSeq", + "region", + 1, + 4000, + ".", + "+", + ".", + "Dbxref=taxon:216597;ID=id0;gbkey=Src;genome=genomic;mol_type=genomic DNA" + ";serovar=Typhimurium;strain=SL1344", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 341, + 523, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;" + "Name=YP_005179941.1;Parent=gene2;gbkey=CDS;product=thr operon leader pep" + "tide;protein_id=YP_005179941.1;transl_table=11", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 170, + 546, + ".", + "+", + ".", + "ID=gene5;Name=thrC;gbkey=Gene;gene=thrC;locus_tag=SL1344_0005", + ], + ], + columns=[ + "seq_id", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + ], + index=[0, 4, 9], +) + + +compare_complement = pd.DataFrame( + [ + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene1;Name=thrL;gbkey=Gene;gene=thrL;locus_tag=SL1344_0001", + ], + [ + "NC_016810.1", + "RefSeq", + "gene", + 1, + 20, + ".", + "+", + ".", + "ID=gene2;Name=thrA;gbkey=Gene;gene=thrA;locus_tag=SL1344_0002", + ], + [ + "NC_016810.1", + "RefSeq", + "CDS", + 341, + 523, + ".", + "+", + "0", + "Dbxref=UniProtKB%252FTrEMBL:E1W7M4%2CGenbank:YP_005179941.1;ID=cds0;Name" + "=YP_005179941.1;Parent=gene2;gbkey=CDS;product=thr operon leader peptide" + ";protein_id=YP_005179941.1;transl_table=11", + ], + ], + columns=[ + "seq_id", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + ], + index=[1, 3, 4], +) def generate_gff3_df(): - read_in_file = gff3pd.read_gff3('fixtures/test_file.gff') + read_in_file = gff3pd.read_gff3("test_file.gff") return read_in_file -def test_read_gff3_if_df_type(): - gff3_df = generate_gff3_df() - assert type(gff3_df) == gff3pd.Gff3DataFrame - pd.testing.assert_frame_equal(gff3_df.df, written_df) +# @print_docstring() +# def test_clean_datadir(request): +# """Clean up datadir.""" +# testdir = Path(request.fspath.dirpath()) +# datadir = testdir / "data" +# if datadir.exists(): +# shutil.rmtree(datadir) # remove anything left in data directory + +# @print_docstring() +# def test_setup_datadir(request, datadir_mgr, capsys): +# """Copy in and download static data.""" +# testdir = Path(request.fspath.dirpath()) +# datadir = testdir / "data" +# filesdir = testdir / "testdata" +# shutil.copytree(filesdir, datadir) +# with capsys.disabled(): +# datadir_mgr.download( +# download_url=REFSEQ_URL, +# files=[HUMAN_GFF], +# scope="global", +# md5_check=False, +# gunzip=True, +# progressbar=True, +# ) + + +@print_docstring() +def test_read_gff3_if_df_type(datadir_mgr): + """Test basic gff3dataframe creation.""" + with datadir_mgr.in_tmp_dir(inpathlist=TESTFILELIST): + gff3_df = generate_gff3_df() + assert type(gff3_df) == gff3pd.Gff3DataFrame + pd.testing.assert_frame_equal(gff3_df.df, written_df) + + +@print_docstring() +def test_generate_gff_header(datadir_mgr): + """Test GFF header generation.""" + with datadir_mgr.in_tmp_dir(inpathlist=TESTFILELIST): + object_header = generate_gff3_df() + generate_header = object_header._read_gff_header() + assert type(object_header) == gff3pd.Gff3DataFrame + assert object_header.header == written_header + assert generate_header == written_header + + +@print_docstring() +def test_if_df_values_equal_gff_values(datadir_mgr): + """Testing whether dataframe values equal input GFF values.""" + with datadir_mgr.in_tmp_dir(inpathlist=TESTFILELIST): + test_df_object = generate_gff3_df() + test_df = test_df_object._read_gff3_to_df() + assert type(test_df_object) == gff3pd.Gff3DataFrame + pd.testing.assert_frame_equal(test_df, written_df) + + +@print_docstring() +def test_to_csv(datadir_mgr): + """Test CSV file creation.""" + with datadir_mgr.in_tmp_dir(inpathlist=TESTFILELIST): + gff3_df = generate_gff3_df() + gff3_df.to_csv("temp.csv") + csv_content = open("temp.csv").read() + assert csv_content == written_csv + + +@print_docstring() +def test_to_tsv(datadir_mgr): + """Test TSV file creation.""" + with datadir_mgr.in_tmp_dir(inpathlist=TESTFILELIST): + gff3_df = generate_gff3_df() + gff3_df.to_tsv("temp.tsv") + tsv_content = open("temp.tsv").read() + assert tsv_content == written_tsv + + +@print_docstring() +def test_to_gff3(datadir_mgr): + """Test GFF file creation and rereading.""" + with datadir_mgr.in_tmp_dir(inpathlist=TESTFILELIST): + gff3_df = generate_gff3_df() + gff3_df.to_gff3("temp.gff") + gff_content = open("temp.gff").read() + assert gff_content == written_gff + read_gff_output = gff3pd.read_gff3("temp.gff") + read_in_file = gff3pd.read_gff3("test_file.gff") + pd.testing.assert_frame_equal(read_in_file.df, read_gff_output.df) + + +@print_docstring() +def test_filter_feature_of_type(datadir_mgr): + """Test feature filtering.""" + with datadir_mgr.in_tmp_dir(inpathlist=TESTFILELIST): + gff3_df = generate_gff3_df() + object_type_df = gff3_df.filter_feature_of_type(["gene"]) + assert type(object_type_df) == gff3pd.Gff3DataFrame + assert object_type_df.df.empty == compare_filter_feature_df.empty + pd.testing.assert_frame_equal(object_type_df.df, compare_filter_feature_df) + assert object_type_df.header == written_header + + +@print_docstring() +def test_filter_by_length(datadir_mgr): + """Test filtering by length.""" + with datadir_mgr.in_tmp_dir(inpathlist=TESTFILELIST): + gff3_df = generate_gff3_df() + filtered_length = gff3_df.filter_by_length(min_length=10, max_length=300) + assert type(filtered_length) == gff3pd.Gff3DataFrame + pd.testing.assert_frame_equal(filtered_length.df, written_filtered_length) + assert filtered_length.header == written_header + + +@print_docstring() +def test_get_feature_by_attribute(datadir_mgr): + """Test get feature by attibute.""" + with datadir_mgr.in_tmp_dir(inpathlist=TESTFILELIST): + gff3_df = generate_gff3_df() + filtered_gff3_df = gff3_df.get_feature_by_attribute("gbkey", ["Gene"]) + filtered_gff3_df2 = gff3_df.get_feature_by_attribute( + "Parent", ["gene2", "gene3", "gene4"] + ) + filtered_gff3_df3 = gff3_df.get_feature_by_attribute( + "locus_tag", ["SL1344_0006"] + ) + assert type(filtered_gff3_df) == gff3pd.Gff3DataFrame + assert type(filtered_gff3_df2) == gff3pd.Gff3DataFrame + assert type(filtered_gff3_df3) == gff3pd.Gff3DataFrame + assert filtered_gff3_df.df.shape == (5, 9) + pd.testing.assert_frame_equal( + filtered_gff3_df.df, compare_get_feature_by_attribute + ) + pd.testing.assert_frame_equal( + filtered_gff3_df2.df, compare_get_feature_by_attribute2 + ) + assert filtered_gff3_df3.df.shape == df_empty.shape + + +@print_docstring() +def test_attributes_to_columns(datadir_mgr): + """Test attributes to columns.""" + with datadir_mgr.in_tmp_dir(inpathlist=TESTFILELIST): + gff3_df = generate_gff3_df() + gff3_df_with_attr_columns = gff3_df.attributes_to_columns() + assert gff3_df_with_attr_columns.shape == (11, 23) + assert gff3_df_with_attr_columns.shape == written_attribute_df.shape + assert type(gff3_df_with_attr_columns) == type(written_attribute_df) + pd.testing.assert_frame_equal(gff3_df_with_attr_columns, written_attribute_df) + + +@print_docstring() +def test_stats_dic(datadir_mgr): + """Test stats dictionary.""" + with datadir_mgr.in_tmp_dir(inpathlist=TESTFILELIST): + gff3_df = generate_gff3_df() + stats_dict = gff3_df.stats_dic() + assert type(stats_dict) == type(compare_stats_dic) + assert stats_dict.keys() == compare_stats_dic.keys() + assert stats_dict["Maximal_bp_length"] == compare_stats_dic["Maximal_bp_length"] + assert stats_dict["Minimal_bp_length"] == compare_stats_dic["Minimal_bp_length"] + assert stats_dict["Counted_strands"] == compare_stats_dic["Counted_strands"] + assert ( + stats_dict["Counted_feature_types"] + == compare_stats_dic["Counted_feature_types"] + ) + + +@print_docstring() +def test_overlaps_with(datadir_mgr): + """Test finding overlaps.""" + with datadir_mgr.in_tmp_dir(inpathlist=TESTFILELIST): + gff3_df = generate_gff3_df() + overlap_gene_1_40 = gff3_df.overlaps_with( + seq_id="NC_016810.1", type="gene", start=1, end=40, strand="+" + ) + overlap_40_300 = gff3_df.overlaps_with( + seq_id="NC_016810.1", start=40, end=300, strand="+" + ) + overlap_170_171 = gff3_df.overlaps_with( + seq_id="NC_016810.1", start=170, end=171, strand="-" + ) + overlap_525_545 = gff3_df.overlaps_with( + seq_id="NC_016810.1", start=525, end=545, strand="+" + ) + overlap_341_500 = gff3_df.overlaps_with( + seq_id="NC_016810.1", start=341, end=500, strand="+" + ) + complement_test = gff3_df.overlaps_with( + seq_id="NC_016810.1", start=40, end=300, strand="+", complement=True + ) + out_of_region = gff3_df.overlaps_with( + seq_id="NC_016810.1", start=1, end=4000, strand="+", complement=True + ) + assert type(overlap_gene_1_40) == gff3pd.Gff3DataFrame + assert type(overlap_40_300) == gff3pd.Gff3DataFrame + assert type(overlap_170_171) == gff3pd.Gff3DataFrame + assert type(overlap_525_545) == gff3pd.Gff3DataFrame + assert type(overlap_341_500) == gff3pd.Gff3DataFrame + assert type(complement_test) == gff3pd.Gff3DataFrame + assert type(out_of_region) == gff3pd.Gff3DataFrame + pd.testing.assert_frame_equal(overlap_gene_1_40.df, compare_overlap_gene_1_40) + pd.testing.assert_frame_equal(overlap_40_300.df, compare_overlap_40_300) + pd.testing.assert_frame_equal(overlap_170_171.df, compare_overlap_170_171) + pd.testing.assert_frame_equal(overlap_525_545.df, compare_overlap_525_545) + pd.testing.assert_frame_equal(overlap_341_500.df, compare_overlap_341_500) + pd.testing.assert_frame_equal(complement_test.df, compare_complement) + assert out_of_region.df.shape == df_empty.shape + + +@print_docstring() +def test_find_duplicated_entries(datadir_mgr): + """Test finding duplicated entries.""" + with datadir_mgr.in_tmp_dir(inpathlist=TESTFILELIST): + gff3_df = generate_gff3_df() + redundant_df = gff3_df.find_duplicated_entries( + seq_id="NC_016810.1", type="gene" + ) + redundant_df2 = gff3_df.find_duplicated_entries( + seq_id="NC_016810.1", type="CDS" + ) + assert type(redundant_df) == gff3pd.Gff3DataFrame + assert type(redundant_df2) == gff3pd.Gff3DataFrame + pd.testing.assert_frame_equal(redundant_df.df, redundant_entry) + assert redundant_df2.df.shape == df_empty.shape + assert redundant_df.df.empty == redundant_entry.empty + - -def test_generate_gff_header(): - object_header = generate_gff3_df() - generate_header = object_header._read_gff_header() - assert type(object_header) == gff3pd.Gff3DataFrame - assert object_header.header == written_header - assert generate_header == written_header - - -def test_if_df_values_equal_gff_values(): - test_df_object = generate_gff3_df() - test_df = test_df_object._read_gff3_to_df() - assert type(test_df_object) == gff3pd.Gff3DataFrame - pd.testing.assert_frame_equal(test_df, written_df) - - -def setup_module(module): - gff3_df = generate_gff3_df() - gff3_df.to_csv('temp.csv') - gff3_df.to_tsv('temp.tsv') - gff3_df.to_gff3('temp.gff') - global csv_content - global tsv_content - global gff_content - csv_content = open('temp.csv').read() - tsv_content = open('temp.tsv').read() - gff_content = open('temp.gff').read() - - -def test_to_csv(): - assert csv_content == written_csv - - -def test_to_tsv(): - assert tsv_content == written_tsv - - -def test_to_gff3(): - assert gff_content == written_gff - read_gff_output = gff3pd.read_gff3('temp.gff') - read_in_file = gff3pd.read_gff3('fixtures/test_file.gff') - pd.testing.assert_frame_equal(read_in_file.df, read_gff_output.df) - - -def teardown_module(module): - os.remove('temp.csv') - os.remove('temp.tsv') - os.remove('temp.gff') - - -def test_filter_feature_of_type(): - gff3_df = generate_gff3_df() - object_type_df = gff3_df.filter_feature_of_type(['gene']) - assert type(object_type_df) == gff3pd.Gff3DataFrame - assert object_type_df.df.empty == compare_filter_feature_df.empty - pd.testing.assert_frame_equal(object_type_df.df, - compare_filter_feature_df) - assert object_type_df.header == written_header - - -def test_filter_by_length(): - gff3_df = generate_gff3_df() - filtered_length = gff3_df.filter_by_length(min_length=10, max_length=300) - assert type(filtered_length) == gff3pd.Gff3DataFrame - pd.testing.assert_frame_equal(filtered_length.df, written_filtered_length) - assert filtered_length.header == written_header - - -def test_get_feature_by_attribute(): - gff3_df = generate_gff3_df() - filtered_gff3_df = gff3_df.get_feature_by_attribute('gbkey', ['Gene']) - filtered_gff3_df2 = gff3_df.get_feature_by_attribute('Parent', - ['gene2', 'gene3', - 'gene4']) - filtered_gff3_df3 = gff3_df.get_feature_by_attribute('locus_tag', - ['SL1344_0006']) - assert type(filtered_gff3_df) == gff3pd.Gff3DataFrame - assert type(filtered_gff3_df2) == gff3pd.Gff3DataFrame - assert type(filtered_gff3_df3) == gff3pd.Gff3DataFrame - assert filtered_gff3_df.df.shape == (5, 9) - pd.testing.assert_frame_equal(filtered_gff3_df.df, - compare_get_feature_by_attribute) - pd.testing.assert_frame_equal(filtered_gff3_df2.df, - compare_get_feature_by_attribute2) - assert filtered_gff3_df3.df.shape == df_empty.shape - - -def test_attributes_to_columns(): - gff3_df = generate_gff3_df() - gff3_df_with_attr_columns = gff3_df.attributes_to_columns() - assert gff3_df_with_attr_columns.shape == (11, 23) - assert gff3_df_with_attr_columns.shape == written_attribute_df.shape - assert type(gff3_df_with_attr_columns) == type(written_attribute_df) - pd.testing.assert_frame_equal(gff3_df_with_attr_columns, - written_attribute_df) - - -def test_stats_dic(): - gff3_df = generate_gff3_df() - stats_dict = gff3_df.stats_dic() - assert type(stats_dict) == type(compare_stats_dic) - assert stats_dict.keys() == compare_stats_dic.keys() - assert stats_dict['Maximal_bp_length'] == compare_stats_dic[ - 'Maximal_bp_length'] - assert stats_dict['Minimal_bp_length'] == compare_stats_dic[ - 'Minimal_bp_length'] - assert stats_dict['Counted_strands'] == compare_stats_dic[ - 'Counted_strands'] - assert stats_dict['Counted_feature_types'] == compare_stats_dic[ - 'Counted_feature_types'] - - -def test_overlaps_with(): - gff3_df = generate_gff3_df() - overlap_gene_1_40 = gff3_df.overlaps_with(seq_id='NC_016810.1', - type='gene', start=1, - end=40, strand='+') - overlap_40_300 = gff3_df.overlaps_with(seq_id='NC_016810.1', - start=40, end=300, strand='+') - overlap_170_171 = gff3_df.overlaps_with(seq_id='NC_016810.1', - start=170, end=171, strand='-') - overlap_525_545 = gff3_df.overlaps_with(seq_id='NC_016810.1', - start=525, end=545, strand='+') - overlap_341_500 = gff3_df.overlaps_with(seq_id='NC_016810.1', - start=341, end=500, strand='+') - complement_test = gff3_df.overlaps_with(seq_id='NC_016810.1', - start=40, end=300, strand='+', - complement=True) - out_of_region = gff3_df.overlaps_with(seq_id='NC_016810.1', - start=1, end=4000, strand='+', - complement=True) - assert type(overlap_gene_1_40) == gff3pd.Gff3DataFrame - assert type(overlap_40_300) == gff3pd.Gff3DataFrame - assert type(overlap_170_171) == gff3pd.Gff3DataFrame - assert type(overlap_525_545) == gff3pd.Gff3DataFrame - assert type(overlap_341_500) == gff3pd.Gff3DataFrame - assert type(complement_test) == gff3pd.Gff3DataFrame - assert type(out_of_region) == gff3pd.Gff3DataFrame - pd.testing.assert_frame_equal(overlap_gene_1_40.df, - compare_overlap_gene_1_40) - pd.testing.assert_frame_equal(overlap_40_300.df, compare_overlap_40_300) - pd.testing.assert_frame_equal(overlap_170_171.df, compare_overlap_170_171) - pd.testing.assert_frame_equal(overlap_525_545.df, compare_overlap_525_545) - pd.testing.assert_frame_equal(overlap_341_500.df, compare_overlap_341_500) - pd.testing.assert_frame_equal(complement_test.df, compare_complement) - assert out_of_region.df.shape == df_empty.shape - - -def test_find_duplicated_entries(): - gff3_df = generate_gff3_df() - redundant_df = gff3_df.find_duplicated_entries(seq_id='NC_016810.1', - type='gene') - redundant_df2 = gff3_df.find_duplicated_entries(seq_id='NC_016810.1', - type='CDS') - assert type(redundant_df) == gff3pd.Gff3DataFrame - assert type(redundant_df2) == gff3pd.Gff3DataFrame - pd.testing.assert_frame_equal(redundant_df.df, redundant_entry) - assert redundant_df2.df.shape == df_empty.shape - assert redundant_df.df.empty == redundant_entry.empty +def test_read_human_genome_gff(datadir_mgr): + """Test reading the human genome GFF.""" + with datadir_mgr.in_tmp_dir(inpathlist=[HUMAN_GFF]): + start = time.time() + human_gff = gff3pd.read_gff3(HUMAN_GFF) + delta = time.time() - start + print(f"Test reading the human genome GFF ({delta:.1f} s).") + assert len(human_gff.df) == 3706805 diff --git a/tests/fixtures/test_file.gff b/tests/testdata/test_file.gff similarity index 100% rename from tests/fixtures/test_file.gff rename to tests/testdata/test_file.gff diff --git a/travis_pypi_setup.py b/travis_pypi_setup.py deleted file mode 100644 index 3a335a0..0000000 --- a/travis_pypi_setup.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -"""Update encrypted deploy password in Travis config file.""" - - -from __future__ import print_function -import base64 -import json -import os -from getpass import getpass -import yaml -from cryptography.hazmat.primitives.serialization import load_pem_public_key -from cryptography.hazmat.backends import default_backend -from cryptography.hazmat.primitives.asymmetric.padding import PKCS1v15 - - -try: - from urllib import urlopen -except ImportError: - from urllib.request import urlopen - - -GITHUB_REPO = 'foerstner-lab/gffpandas' -TRAVIS_CONFIG_FILE = os.path.join( - os.path.dirname(os.path.abspath(__file__)), '.travis.yml') - - -def load_key(pubkey): - """Load public RSA key. - - Work around keys with incorrect header/footer format. - - Read more about RSA encryption with cryptography: - https://cryptography.io/latest/hazmat/primitives/asymmetric/rsa/ - """ - try: - return load_pem_public_key(pubkey.encode(), default_backend()) - except ValueError: - # workaround for https://github.com/travis-ci/travis-api/issues/196 - pubkey = pubkey.replace('BEGIN RSA', 'BEGIN').replace('END RSA', 'END') - return load_pem_public_key(pubkey.encode(), default_backend()) - - -def encrypt(pubkey, password): - """Encrypt password using given RSA public key and encode it with base64. - - The encrypted password can only be decrypted by someone with the - private key (in this case, only Travis). - """ - key = load_key(pubkey) - encrypted_password = key.encrypt(password, PKCS1v15()) - return base64.b64encode(encrypted_password) - - -def fetch_public_key(repo): - """Download RSA public key Travis will use for this repo. - - Travis API docs: http://docs.travis-ci.com/api/#repository-keys - """ - keyurl = 'https://api.travis-ci.org/repos/{0}/key'.format(repo) - data = json.loads(urlopen(keyurl).read().decode()) - if 'key' not in data: - errmsg = "Could not find public key for repo: {}.\n".format(repo) - errmsg += "Have you already added your GitHub repo to Travis?" - raise ValueError(errmsg) - return data['key'] - - -def prepend_line(filepath, line): - """Rewrite a file adding a line to its beginning.""" - with open(filepath) as f: - lines = f.readlines() - - lines.insert(0, line) - - with open(filepath, 'w') as f: - f.writelines(lines) - - -def load_yaml_config(filepath): - """Load yaml config file at the given path.""" - with open(filepath) as f: - return yaml.load(f) - - -def save_yaml_config(filepath, config): - """Save yaml config file at the given path.""" - with open(filepath, 'w') as f: - yaml.dump(config, f, default_flow_style=False) - - -def update_travis_deploy_password(encrypted_password): - """Put `encrypted_password` into the deploy section of .travis.yml.""" - config = load_yaml_config(TRAVIS_CONFIG_FILE) - - config['deploy']['password'] = dict(secure=encrypted_password) - - save_yaml_config(TRAVIS_CONFIG_FILE, config) - - line = ('# This file was autogenerated and will overwrite' - ' each time you run travis_pypi_setup.py\n') - prepend_line(TRAVIS_CONFIG_FILE, line) - - -def main(args): - """Add a PyPI password to .travis.yml so that Travis can deploy to PyPI. - - Fetch the Travis public key for the repo, and encrypt the PyPI password - with it before adding, so that only Travis can decrypt and use the PyPI - password. - """ - public_key = fetch_public_key(args.repo) - password = args.password or getpass('PyPI password: ') - update_travis_deploy_password(encrypt(public_key, password.encode())) - print("Wrote encrypted password to .travis.yml -- you're ready to deploy") - - -if '__main__' == __name__: - import argparse - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument('--repo', default=GITHUB_REPO, - help='GitHub repo (default: %s)' % GITHUB_REPO) - parser.add_argument('--password', - help='PyPI password (will prompt if not provided)') - - args = parser.parse_args() - main(args) From e07bd81b8920929c3e977abef22bbe198e4e37cc Mon Sep 17 00:00:00 2001 From: Joel Berendzen Date: Tue, 8 Dec 2020 17:16:25 -0700 Subject: [PATCH 5/6] minor travis fix --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 7cb10f5..915e0e6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ dist: focal -sudo: false +os: linux language: python cache: directories: From 27b1ac44febefdfbd7d65d44079d3c16d3e4ff3a Mon Sep 17 00:00:00 2001 From: Joel Berendzen Date: Wed, 9 Dec 2020 09:09:10 -0700 Subject: [PATCH 6/6] had copy-in commented out --- tests/test_gffpandas.py | 50 +++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/tests/test_gffpandas.py b/tests/test_gffpandas.py index f2ec4a4..61b1ad3 100644 --- a/tests/test_gffpandas.py +++ b/tests/test_gffpandas.py @@ -4,7 +4,9 @@ """Tests for `gffpandas` package.""" # standard library imports +import shutil import time +from pathlib import Path # first-party imports import gffpandas.gffpandas as gff3pd @@ -1284,30 +1286,30 @@ def generate_gff3_df(): return read_in_file -# @print_docstring() -# def test_clean_datadir(request): -# """Clean up datadir.""" -# testdir = Path(request.fspath.dirpath()) -# datadir = testdir / "data" -# if datadir.exists(): -# shutil.rmtree(datadir) # remove anything left in data directory - -# @print_docstring() -# def test_setup_datadir(request, datadir_mgr, capsys): -# """Copy in and download static data.""" -# testdir = Path(request.fspath.dirpath()) -# datadir = testdir / "data" -# filesdir = testdir / "testdata" -# shutil.copytree(filesdir, datadir) -# with capsys.disabled(): -# datadir_mgr.download( -# download_url=REFSEQ_URL, -# files=[HUMAN_GFF], -# scope="global", -# md5_check=False, -# gunzip=True, -# progressbar=True, -# ) +@print_docstring() +def test_clean_datadir(request): + """Clean up datadir.""" + testdir = Path(request.fspath.dirpath()) + datadir = testdir / "data" + if datadir.exists(): + shutil.rmtree(datadir) # remove anything left in data directory + +@print_docstring() +def test_setup_datadir(request, datadir_mgr, capsys): + """Copy in and download static data.""" + testdir = Path(request.fspath.dirpath()) + datadir = testdir / "data" + filesdir = testdir / "testdata" + shutil.copytree(filesdir, datadir) + with capsys.disabled(): + datadir_mgr.download( + download_url=REFSEQ_URL, + files=[HUMAN_GFF], + scope="global", + md5_check=False, + gunzip=True, + progressbar=True, + ) @print_docstring()