Skip to content

Commit

Permalink
Merge pull request arq5x#1029 from brentp/htslib-faidx
Browse files Browse the repository at this point in the history
use htslib/faidx.h to get fasta sequences
  • Loading branch information
arq5x authored Nov 30, 2022
2 parents a658b71 + cddbadf commit 4c19da5
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 271 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,5 @@ test/*/obs
test/genomecov/pair-chip.bam
test/bigchroms/bigx.fasta
test/bigchroms/bigx.fasta.fai
test/getfasta/*.gzi
test/getfasta/*.gz.fai
302 changes: 48 additions & 254 deletions src/utils/Fasta/Fasta.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,15 @@
// ---------------------------------------------------------------------------
// Last modified: 9 February 2010 (EG)
// ---------------------------------------------------------------------------
// Modified to use htslib/faidx December 2022 (BP)

#include "Fasta.h"

using namespace std;

FastaIndexEntry::FastaIndexEntry(string name, CHRPOS length, CHRPOS offset, CHRPOS line_blen, CHRPOS line_len, bool useFullHeader)
FastaIndexEntry::FastaIndexEntry(string name, CHRPOS length)
: name(name)
, length(length)
, offset(offset)
, line_blen(line_blen)
, line_len(line_len)
, useFullHeader(useFullHeader)
{}

FastaIndexEntry::FastaIndexEntry(void) // empty constructor
Expand All @@ -29,189 +26,41 @@ void FastaIndexEntry::clear(void)
{
name = "";
length = 0;
offset = -1; // no real offset will ever be below 0, so this allows us to
// check if we have already recorded a real offset
line_blen = 0;
line_len = 0;
useFullHeader = false;
}

ostream& operator<<(ostream& output, const FastaIndexEntry& e) {
// just write the first component of the name, for compliance with other tools
output << (e.useFullHeader? e.name : split(e.name, ' ').at(0))
<< "\t" << e.length << "\t" << e.offset << "\t"
<< e.line_blen << "\t" << e.line_len;
return output; // for multiple << operators.
}

FastaIndex::FastaIndex(bool useFullHeader)
: useFullHeader(useFullHeader)
{}

void FastaIndex::readIndexFile(string fname) {
string line;
long long linenum = 0;
indexFile.open(fname.c_str(), ifstream::in);
if (indexFile.is_open()) {
while (getline (indexFile, line)) {
++linenum;
// the fai format defined in samtools is tab-delimited, every line being:
// fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len
vector<string> fields = split(line, '\t');
if (fields.size() == 5) { // if we don't get enough fields then there is a problem with the file
// note that fields[0] is the sequence name
char* end;
string name = useFullHeader ? fields[0] :
split(fields[0], " \t").at(0); // key by first token of name
sequenceNames.push_back(name);
this->insert(make_pair(name, FastaIndexEntry(fields[0], stoll(fields[1].c_str()),
strtoll(fields[2].c_str(), &end, 10),
stoll(fields[3].c_str()),
stoll(fields[4].c_str()),
useFullHeader)));
} else {
cerr << "Warning: malformed fasta index file " << fname <<
"does not have enough fields @ line " << linenum << endl;
cerr << line << endl;
exit(1);
}
}
} else {
cerr << "could not open index file " << fname << endl;
exit(1);
}
}

// for consistency this should be a class method
bool fastaIndexEntryCompare ( FastaIndexEntry a, FastaIndexEntry b) { return (a.offset<b.offset); }

ostream& operator<<(ostream& output, FastaIndex& fastaIndex) {
vector<FastaIndexEntry> sortedIndex;
for(vector<string>::const_iterator it = fastaIndex.sequenceNames.begin(); it != fastaIndex.sequenceNames.end(); ++it)
{
sortedIndex.push_back(fastaIndex[*it]);
}
sort(sortedIndex.begin(), sortedIndex.end(), fastaIndexEntryCompare);
for( vector<FastaIndexEntry>::iterator fit = sortedIndex.begin(); fit != sortedIndex.end(); ++fit) {
output << *fit << endl;
}
return output;
}
faidx = fai_load(fname.c_str());
const char *idx_path = (fname + ".fai").c_str();

void FastaIndex::indexReference(string refname) {
// overview:
// for line in the reference fasta file
// track byte offset from the start of the file
// if line is a fasta header, take the name and dump the last sequnece to the index
// if line is a sequence, add it to the current sequence
//cerr << "indexing fasta reference " << refname << endl;
string line;
FastaIndexEntry entry; // an entry buffer used in processing
entry.clear();
CHRPOS line_length = 0;
long long offset = 0; // byte offset from start of file
long long line_number = 0; // current line number
bool mismatchedLineLengths = false; // flag to indicate if our line length changes mid-file
// this will be used to raise an error
// if we have a line length change at
// any line other than the last line in
// the sequence
bool emptyLine = false; // flag to catch empty lines, which we allow for
// index generation only on the last line of the sequence
ifstream refFile;
refFile.open(refname.c_str());
if (refFile.is_open()) {
while (getline(refFile, line)) {
++line_number;
line_length = line.length();
if (line[0] == ';') {
// fasta comment, skip
} else if (line[0] == '+') {
// fastq quality header
getline(refFile, line);
line_length = line.length();
offset += line_length + 1;
// get and don't handle the quality line
getline(refFile, line);
line_length = line.length();
} else if (line[0] == '>' || line[0] == '@') { // fasta /fastq header
// if we aren't on the first entry, push the last sequence into the index
if (entry.name != "") {
mismatchedLineLengths = false; // reset line length error tracker for every new sequence
emptyLine = false;
flushEntryToIndex(entry);
entry.clear();
}
entry.name = line.substr(1, line_length - 1);
} else { // we assume we have found a sequence line
if (entry.offset == -1) // NB initially the offset is -1
entry.offset = offset;
entry.length += line_length;
if (entry.line_len) {
//entry.line_len = entry.line_len ? entry.line_len : line_length + 1;
if (mismatchedLineLengths || emptyLine) {
if (line_length == 0) {
emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence
} else {
if (emptyLine) {
cerr << "ERROR: embedded newline";
} else {
cerr << "ERROR: mismatched line lengths";
}
cerr << " at line " << line_number << " within sequence " << entry.name <<
endl << "File not suitable for fasta index generation." << endl;
exit(1);
}
}
// this flag is set here and checked on the next line
// because we may have reached the end of the sequence, in
// which case a mismatched line length is OK
if (entry.line_len != line_length + 1) {
mismatchedLineLengths = true;
if (line_length == 0) {
emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence
}
}
} else {
entry.line_len = line_length + 1; // first line
}
entry.line_blen = entry.line_len - 1;
}
offset += line_length + 1;
}
// we've hit the end of the fasta file!
// flush the last entry
flushEntryToIndex(entry);
} else {
cerr << "could not open reference file " << refname << " for indexing!" << endl;
// NOTE: can use faidx_set_cache_size to improve performance of repeated queries.
if(faidx == NULL) {
cerr << "Warning: malformed fasta index file " << fname << endl;
exit(1);
}
}

void FastaIndex::flushEntryToIndex(FastaIndexEntry& entry) {
string name = useFullHeader? entry.name :
split(entry.name, " \t").at(0); // key by first token of name
sequenceNames.push_back(name);
this->insert(make_pair(name, FastaIndexEntry(entry.name, entry.length,
entry.offset, entry.line_blen,
entry.line_len,
useFullHeader)));
}
struct stat index_attr, fasta_attr;
stat(idx_path, &index_attr);
stat(fname.c_str(), &fasta_attr);
if(fasta_attr.st_mtime > index_attr.st_mtime) {
cerr << "Warning: the index file is older than the FASTA file." << endl;
}

void FastaIndex::writeIndexFile(string fname) {
//cerr << "writing fasta index file " << fname << endl;
ofstream file;
file.open(fname.c_str());
if (file.is_open()) {
file << *this;
} else {
cerr << "could not open index file " << fname << " for writing!" << endl;
exit(1);
int n_seq = faidx_nseq(faidx);
for(int i = 0; i < n_seq; i++){
const char* chrom_name = faidx_iseq(faidx, i);
int chrom_len = faidx_seq_len(faidx, chrom_name); // TODO: update to faidx_seq_len64 when htslib is updated.
sequenceNames.push_back(chrom_name);
this->insert(make_pair(chrom_name, FastaIndexEntry(chrom_name, chrom_len)));
}
}

FastaIndex::~FastaIndex(void) {
indexFile.close();
fai_destroy(faidx);
}

FastaIndexEntry FastaIndex::entry(string name) {
Expand All @@ -236,111 +85,57 @@ string FastaIndex::indexFileExtension() { return ".fai"; }

void FastaReference::open(string reffilename, bool usemmap, bool useFullHeader) {
filename = reffilename;
if (!(file = fopen(filename.c_str(), "r"))) {
cerr << "could not open " << filename << endl;
exit(1);
}

if (useFullHeader)
usingfullheader = true;
index = new FastaIndex(useFullHeader);
struct stat stFileInfo;
string indexFileName = filename + index->indexFileExtension();
// if we can find an index file, use it
if(stat(indexFileName.c_str(), &stFileInfo) == 0) {
// check if the index file is older than the FASTA file
struct stat index_attrib, fasta_attrib;
stat(indexFileName.c_str(), &index_attrib);
stat(reffilename.c_str(), &fasta_attrib);
if (fasta_attrib.st_mtime > index_attrib.st_mtime) {
cerr << "Warning: the index file is older than the FASTA file." << endl;
}
index->readIndexFile(indexFileName);
} else { // otherwise, read the reference and generate the index file in the cwd
cerr << "index file " << indexFileName << " not found, generating..." << endl;
index->indexReference(filename);
index->writeIndexFile(indexFileName);
}
if (usemmap) {
usingmmap = true;
int fd = fileno(file);
struct stat sb;
if (fstat(fd, &sb) == -1)
cerr << "could not stat file" << filename << endl;
filesize = sb.st_size;
// map the whole file
filemm = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0);
}
index->readIndexFile(reffilename);
}

FastaReference::~FastaReference(void) {
fclose(file);
if (usingmmap) {
munmap(filemm, filesize);
}
delete index;
}

string FastaReference::getSequence(string seqname) {
FastaIndexEntry entry = index->entry(seqname);
CHRPOS newlines_in_sequence = entry.length / entry.line_blen;
CHRPOS seqlen = newlines_in_sequence + entry.length;
char* seq = (char*) calloc (seqlen + 1, sizeof(char));
if (usingmmap) {
memcpy(seq, (char*) filemm + entry.offset, seqlen);
} else {
fseek64(file, entry.offset, SEEK_SET);
fread(seq, sizeof(char), seqlen, file);
}
seq[seqlen] = '\0';
char* pbegin = seq;
char* pend = seq + (seqlen/sizeof(char));
pend = remove(pbegin, pend, '\n');
pend = remove(pbegin, pend, '\0');
string s = seq;
free(seq);
s.resize((pend - pbegin)/sizeof(char));
return s;
}

// TODO cleanup; odd function. use a map
string FastaReference::sequenceNameStartingWith(string seqnameStart) {
try {
return (*index)[seqnameStart].name;
} catch (exception& e) {
cerr << e.what() << ": unable to find index entry for " << seqnameStart << endl;
int len = 0;
// NOTE: could perhaps use string.reserve, then send the pointer to avoid the copy below.
char *seq = fai_fetch(index->faidx, seqname.c_str(), &len); // TODO: update to fai_fetch64 when htslib is updated.
if (len == -1) {
cerr << "error getting sequence:" << seqname << " general error" << endl;
exit(1);
}
if (len == -2) {
cerr << "error getting sequence:" << seqname << " not found in index" << endl;
exit(1);
}

string s = seq;
std::free(seq);
return s;
}

string FastaReference::getSubSequence(string seqname, CHRPOS start, CHRPOS length) {
//cerr << "searching:" << seqname << ":" << start << "-" << start + length << endl;
FastaIndexEntry entry = index->entry(seqname);
if (start < 0 || length < 1) {
cerr << "Error: cannot construct subsequence with negative offset or length < 1" << endl;
exit(1);
}
// we have to handle newlines
// approach: count newlines before start
// count newlines by end of read
// subtracting newlines before start find count of embedded newlines
CHRPOS newlines_before = start > 0 ? (start - 1) / entry.line_blen : 0;
CHRPOS newlines_by_end = (start + length - 1) / entry.line_blen;
CHRPOS newlines_inside = newlines_by_end - newlines_before;
CHRPOS seqlen = length + newlines_inside;
char* seq = (char*) calloc (seqlen + 1, sizeof(char));
if (usingmmap) {
memcpy(seq, (char*) filemm + entry.offset + newlines_before + start, seqlen);
} else {
fseek64(file, (off_t) (entry.offset + newlines_before + start), SEEK_SET);
fread(seq, sizeof(char), (off_t) seqlen, file);
int len = 0;
char *seq = faidx_fetch_seq(index->faidx, seqname.c_str(), start, start + length - 1, &len); // TODO: update to fai_fetch_seq64 when htslib is updated.
if (len == -1) {
cerr << "error getting sequence:" << seqname << " general error" << endl;
exit(1);
}
seq[seqlen] = '\0';
char* pbegin = seq;
char* pend = seq + (seqlen/sizeof(char));
pend = remove(pbegin, pend, '\n');
pend = remove(pbegin, pend, '\0');
if (len == -2) {
cerr << "error getting sequence:" << seqname << " not found in index" << endl;
exit(1);
}

string s = seq;
free(seq);
s.resize((pend - pbegin)/sizeof(char));
std::free(seq);
return s;
}

Expand All @@ -351,5 +146,4 @@ CHRPOS FastaReference::sequenceLength(string seqname) {
}
// 0 length means the chrom wasn't found
return 0;
}

}
Loading

0 comments on commit 4c19da5

Please sign in to comment.