Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

splits_atts with test against human genome GFF #14

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,9 @@ target/

# pyenv python configuration file
.python-version

# virtual environment
venv/

# test data
tests/data/
33 changes: 11 additions & 22 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,26 +1,15 @@
# Config file for automatic testing at travis-ci.org
# This file will be regenerated if you run travis_pypi_setup.py

dist: focal
os: linux
language: python
cache:
directories:
- $HOME/.cache/pip
- $HOME/tests/data
python:
- 3.6

# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
- "3.8"
install:
- "pip install -r requirements_dev.txt"

# command to run tests
script: cd tests && pytest && cd ..
- pip install -r requirements.txt
- pip install -v pytest pytest-cov pytest-datadir-mgr
script:
- pytest --cov=gffpandas -s

# After you create the Github repo and add it to Travis, run the
# travis_pypi_setup.py script to finish PyPI deployment setup
# deploy:
# provider: pypi
# distributions: sdist bdist_wheel
# user: konrad
# password:
# secure: PLEASE_REPLACE_ME
# on:
# tags: true
# repo: konrad/pandasgff
# python: 2.7
1 change: 0 additions & 1 deletion fixtures

This file was deleted.

47 changes: 31 additions & 16 deletions gffpandas/gffpandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,22 @@ def read_gff3(input_file):
return Gff3DataFrame(input_file)


def _split_atts(atts):
"""Split a feature string into attributes."""
splits_list = [a.split("=") for a in atts.split(";") if "=" in a]
return {item[0]: "=".join(item[1:]) for item in splits_list}


class Gff3DataFrame(object):
"""This class contains header information in the header attribute and
the actual annotation data in the pandas dataframe in the df
attribute.

"""

def __init__(self, input_gff_file=None, input_df=None, input_header=None) -> None:
def __init__(
self, input_gff_file=None, input_df=None, input_header=None
) -> None:
"""Create an instance."""
if input_gff_file is not None:
self._gff_file = input_gff_file
Expand Down Expand Up @@ -146,7 +154,9 @@ def filter_feature_of_type(self, feature_type_list) -> "Gff3DataFrame":
feature_df = self.df.loc[self.df.type.isin(feature_type_list)]
return Gff3DataFrame(input_df=feature_df, input_header=self.header)

def filter_by_length(self, min_length=None, max_length=None) -> "Gff3DataFrame":
def filter_by_length(
self, min_length=None, max_length=None
) -> "Gff3DataFrame":
"""Filtering the pandas dataframe by the gene_length.

For this method the desired minimal and maximal bp length
Expand All @@ -165,7 +175,9 @@ def filter_by_length(self, min_length=None, max_length=None) -> "Gff3DataFrame":
filtered_by_length = self.df[
(gene_length >= min_length) & (gene_length <= max_length)
]
return Gff3DataFrame(input_df=filtered_by_length, input_header=self.header)
return Gff3DataFrame(
input_df=filtered_by_length, input_header=self.header
)

def attributes_to_columns(self) -> pd.DataFrame:
"""Saving each attribute-tag to a single column.
Expand All @@ -181,14 +193,7 @@ def attributes_to_columns(self) -> pd.DataFrame:
"""
attribute_df = self.df.copy()
df_attributes = attribute_df.loc[:, "seq_id":"attributes"]
attribute_df["at_dic"] = attribute_df.attributes.apply(
lambda attributes: dict(
[
key_value_pair.split(sep="=", maxsplit=1)
for key_value_pair in attributes.split(";")
]
)
)
attribute_df["at_dic"] = attribute_df.attributes.apply(_split_atts)
attribute_df["at_dic_keys"] = attribute_df["at_dic"].apply(
lambda at_dic: list(at_dic.keys())
)
Expand All @@ -202,7 +207,9 @@ def attributes_to_columns(self) -> pd.DataFrame:
)
return df_attributes

def get_feature_by_attribute(self, attr_tag, attr_value_list) -> "Gff3DataFrame":
def get_feature_by_attribute(
self, attr_tag, attr_value_list
) -> "Gff3DataFrame":
"""Filtering the pandas dataframe by a attribute.

The 9th column of a gff3-file contains the list of feature
Expand All @@ -226,8 +233,12 @@ def get_feature_by_attribute(self, attr_tag, attr_value_list) -> "Gff3DataFrame"
"""
df_copy = self.df.copy()
attribute_df = Gff3DataFrame.attributes_to_columns(self)
filtered_by_attr_df = df_copy.loc[attribute_df[attr_tag].isin(attr_value_list)]
return Gff3DataFrame(input_df=filtered_by_attr_df, input_header=self.header)
filtered_by_attr_df = df_copy.loc[
attribute_df[attr_tag].isin(attr_value_list)
]
return Gff3DataFrame(
input_df=filtered_by_attr_df, input_header=self.header
)

def stats_dic(self) -> dict:
"""Gives the following statistics for the data:
Expand Down Expand Up @@ -320,7 +331,9 @@ def overlaps_with(
overlap_df = overlap_df[~condition]
return Gff3DataFrame(input_df=overlap_df, input_header=self.header)

def find_duplicated_entries(self, seq_id=None, type=None) -> "Gff3DataFrame":
def find_duplicated_entries(
self, seq_id=None, type=None
) -> "Gff3DataFrame":
"""Find entries which are redundant.

For this method the chromosome accession number (seq_id) as well as the
Expand All @@ -338,5 +351,7 @@ def find_duplicated_entries(self, seq_id=None, type=None) -> "Gff3DataFrame":
"""
input_df = self.df[self.df.seq_id == seq_id]
df_feature = input_df[input_df.type == type]
duplicate = df_feature.loc[df_feature[["end", "start", "strand"]].duplicated()]
duplicate = df_feature.loc[
df_feature[["end", "start", "strand"]].duplicated()
]
return Gff3DataFrame(input_df=duplicate, input_header=self.header)
1 change: 0 additions & 1 deletion pytest.ini

This file was deleted.

13 changes: 0 additions & 13 deletions requirements_dev.txt

This file was deleted.

21 changes: 20 additions & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,22 @@
# -*- coding: utf-8 -*-

"""Unit test package for gffpandas."""

# standard library imports
import functools


def print_docstring():
    """Build a decorator that prints a function's docstring on each call.

    Returns:
        A decorator.  The function it produces prints the wrapped
        function's ``__doc__`` to standard output and then forwards the
        call, with all arguments, to the wrapped function.
    """

    def decorator(func):
        """Wrap *func* so that its docstring is printed before it runs."""

        @functools.wraps(func)
        def announce_and_call(*positional, **named):
            """Print the docstring of *func*, then delegate to it."""
            # The docstring is read at call time, not at decoration time.
            print(func.__doc__)
            outcome = func(*positional, **named)
            return outcome

        return announce_and_call

    return decorator
1 change: 0 additions & 1 deletion tests/gffpandas

This file was deleted.

3 changes: 0 additions & 3 deletions tests/pytest.ini

This file was deleted.

Loading