diff --git a/.gitignore b/.gitignore index a685aeab4..0841cb8d9 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,5 @@ test/*/obs test/genomecov/pair-chip.bam test/bigchroms/bigx.fasta test/bigchroms/bigx.fasta.fai +test/getfasta/*.gzi +test/getfasta/*.gz.fai diff --git a/src/utils/Fasta/Fasta.cpp b/src/utils/Fasta/Fasta.cpp index 71e2a35be..4cc53ba8d 100644 --- a/src/utils/Fasta/Fasta.cpp +++ b/src/utils/Fasta/Fasta.cpp @@ -5,18 +5,15 @@ // --------------------------------------------------------------------------- // Last modified: 9 February 2010 (EG) // --------------------------------------------------------------------------- +// Modified to use htslib/faidx December 2022 (BP) #include "Fasta.h" using namespace std; -FastaIndexEntry::FastaIndexEntry(string name, CHRPOS length, CHRPOS offset, CHRPOS line_blen, CHRPOS line_len, bool useFullHeader) +FastaIndexEntry::FastaIndexEntry(string name, CHRPOS length) : name(name) , length(length) - , offset(offset) - , line_blen(line_blen) - , line_len(line_len) - , useFullHeader(useFullHeader) {} FastaIndexEntry::FastaIndexEntry(void) // empty constructor @@ -29,19 +26,6 @@ void FastaIndexEntry::clear(void) { name = ""; length = 0; - offset = -1; // no real offset will ever be below 0, so this allows us to - // check if we have already recorded a real offset - line_blen = 0; - line_len = 0; - useFullHeader = false; -} - -ostream& operator<<(ostream& output, const FastaIndexEntry& e) { - // just write the first component of the name, for compliance with other tools - output << (e.useFullHeader? e.name : split(e.name, ' ').at(0)) - << "\t" << e.length << "\t" << e.offset << "\t" - << e.line_blen << "\t" << e.line_len; - return output; // for multiple << operators. } FastaIndex::FastaIndex(bool useFullHeader) @@ -49,169 +33,34 @@ FastaIndex::FastaIndex(bool useFullHeader) {} void FastaIndex::readIndexFile(string fname) { - string line; - long long linenum = 0; - indexFile.open(fname.c_str(), ifstream::in); - if (indexFile.is_open()) { - while (getline (indexFile, line)) { - ++linenum; - // the fai format defined in samtools is tab-delimited, every line being: - // fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len - vector fields = split(line, '\t'); - if (fields.size() == 5) { // if we don't get enough fields then there is a problem with the file - // note that fields[0] is the sequence name - char* end; - string name = useFullHeader ? fields[0] : - split(fields[0], " \t").at(0); // key by first token of name - sequenceNames.push_back(name); - this->insert(make_pair(name, FastaIndexEntry(fields[0], stoll(fields[1].c_str()), - strtoll(fields[2].c_str(), &end, 10), - stoll(fields[3].c_str()), - stoll(fields[4].c_str()), - useFullHeader))); - } else { - cerr << "Warning: malformed fasta index file " << fname << - "does not have enough fields @ line " << linenum << endl; - cerr << line << endl; - exit(1); - } - } - } else { - cerr << "could not open index file " << fname << endl; - exit(1); - } -} - -// for consistency this should be a class method -bool fastaIndexEntryCompare ( FastaIndexEntry a, FastaIndexEntry b) { return (a.offset sortedIndex; - for(vector::const_iterator it = fastaIndex.sequenceNames.begin(); it != fastaIndex.sequenceNames.end(); ++it) - { - sortedIndex.push_back(fastaIndex[*it]); - } - sort(sortedIndex.begin(), sortedIndex.end(), fastaIndexEntryCompare); - for( vector::iterator fit = sortedIndex.begin(); fit != sortedIndex.end(); ++fit) { - output << *fit << endl; - } - return output; -} + faidx = fai_load(fname.c_str()); + const char *idx_path = (fname + ".fai").c_str(); -void FastaIndex::indexReference(string refname) { - // overview: - // for line in the reference fasta file - // track byte offset from the start of the file - // if line is a fasta header, take the name and dump the last sequnece to the index - // if line is a sequence, add it to the current sequence - //cerr << "indexing fasta reference " << refname << endl; - string line; - FastaIndexEntry entry; // an entry buffer used in processing - entry.clear(); - CHRPOS line_length = 0; - long long offset = 0; // byte offset from start of file - long long line_number = 0; // current line number - bool mismatchedLineLengths = false; // flag to indicate if our line length changes mid-file - // this will be used to raise an error - // if we have a line length change at - // any line other than the last line in - // the sequence - bool emptyLine = false; // flag to catch empty lines, which we allow for - // index generation only on the last line of the sequence - ifstream refFile; - refFile.open(refname.c_str()); - if (refFile.is_open()) { - while (getline(refFile, line)) { - ++line_number; - line_length = line.length(); - if (line[0] == ';') { - // fasta comment, skip - } else if (line[0] == '+') { - // fastq quality header - getline(refFile, line); - line_length = line.length(); - offset += line_length + 1; - // get and don't handle the quality line - getline(refFile, line); - line_length = line.length(); - } else if (line[0] == '>' || line[0] == '@') { // fasta /fastq header - // if we aren't on the first entry, push the last sequence into the index - if (entry.name != "") { - mismatchedLineLengths = false; // reset line length error tracker for every new sequence - emptyLine = false; - flushEntryToIndex(entry); - entry.clear(); - } - entry.name = line.substr(1, line_length - 1); - } else { // we assume we have found a sequence line - if (entry.offset == -1) // NB initially the offset is -1 - entry.offset = offset; - entry.length += line_length; - if (entry.line_len) { - //entry.line_len = entry.line_len ? entry.line_len : line_length + 1; - if (mismatchedLineLengths || emptyLine) { - if (line_length == 0) { - emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence - } else { - if (emptyLine) { - cerr << "ERROR: embedded newline"; - } else { - cerr << "ERROR: mismatched line lengths"; - } - cerr << " at line " << line_number << " within sequence " << entry.name << - endl << "File not suitable for fasta index generation." << endl; - exit(1); - } - } - // this flag is set here and checked on the next line - // because we may have reached the end of the sequence, in - // which case a mismatched line length is OK - if (entry.line_len != line_length + 1) { - mismatchedLineLengths = true; - if (line_length == 0) { - emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence - } - } - } else { - entry.line_len = line_length + 1; // first line - } - entry.line_blen = entry.line_len - 1; - } - offset += line_length + 1; - } - // we've hit the end of the fasta file! - // flush the last entry - flushEntryToIndex(entry); - } else { - cerr << "could not open reference file " << refname << " for indexing!" << endl; + // NOTE: can use faidx_set_cache_size to improve performance of repeated queries. + if(faidx == NULL) { + cerr << "Warning: malformed fasta index file " << fname << endl; exit(1); } -} -void FastaIndex::flushEntryToIndex(FastaIndexEntry& entry) { - string name = useFullHeader? entry.name : - split(entry.name, " \t").at(0); // key by first token of name - sequenceNames.push_back(name); - this->insert(make_pair(name, FastaIndexEntry(entry.name, entry.length, - entry.offset, entry.line_blen, - entry.line_len, - useFullHeader))); -} + struct stat index_attr, fasta_attr; + stat(idx_path, &index_attr); + stat(fname.c_str(), &fasta_attr); + if(fasta_attr.st_mtime > index_attr.st_mtime) { + cerr << "Warning: the index file is older than the FASTA file." << endl; + } -void FastaIndex::writeIndexFile(string fname) { - //cerr << "writing fasta index file " << fname << endl; - ofstream file; - file.open(fname.c_str()); - if (file.is_open()) { - file << *this; - } else { - cerr << "could not open index file " << fname << " for writing!" << endl; - exit(1); + int n_seq = faidx_nseq(faidx); + for(int i = 0; i < n_seq; i++){ + const char* chrom_name = faidx_iseq(faidx, i); + int chrom_len = faidx_seq_len(faidx, chrom_name); // TODO: update to faidx_seq_len64 when htslib is updated. + sequenceNames.push_back(chrom_name); + this->insert(make_pair(chrom_name, FastaIndexEntry(chrom_name, chrom_len))); } } FastaIndex::~FastaIndex(void) { - indexFile.close(); + fai_destroy(faidx); } FastaIndexEntry FastaIndex::entry(string name) { @@ -236,111 +85,57 @@ string FastaIndex::indexFileExtension() { return ".fai"; } void FastaReference::open(string reffilename, bool usemmap, bool useFullHeader) { filename = reffilename; - if (!(file = fopen(filename.c_str(), "r"))) { - cerr << "could not open " << filename << endl; - exit(1); - } + if (useFullHeader) usingfullheader = true; index = new FastaIndex(useFullHeader); - struct stat stFileInfo; - string indexFileName = filename + index->indexFileExtension(); - // if we can find an index file, use it - if(stat(indexFileName.c_str(), &stFileInfo) == 0) { - // check if the index file is older than the FASTA file - struct stat index_attrib, fasta_attrib; - stat(indexFileName.c_str(), &index_attrib); - stat(reffilename.c_str(), &fasta_attrib); - if (fasta_attrib.st_mtime > index_attrib.st_mtime) { - cerr << "Warning: the index file is older than the FASTA file." << endl; - } - index->readIndexFile(indexFileName); - } else { // otherwise, read the reference and generate the index file in the cwd - cerr << "index file " << indexFileName << " not found, generating..." << endl; - index->indexReference(filename); - index->writeIndexFile(indexFileName); - } - if (usemmap) { - usingmmap = true; - int fd = fileno(file); - struct stat sb; - if (fstat(fd, &sb) == -1) - cerr << "could not stat file" << filename << endl; - filesize = sb.st_size; - // map the whole file - filemm = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0); - } + index->readIndexFile(reffilename); } FastaReference::~FastaReference(void) { - fclose(file); - if (usingmmap) { - munmap(filemm, filesize); - } delete index; } string FastaReference::getSequence(string seqname) { FastaIndexEntry entry = index->entry(seqname); - CHRPOS newlines_in_sequence = entry.length / entry.line_blen; - CHRPOS seqlen = newlines_in_sequence + entry.length; - char* seq = (char*) calloc (seqlen + 1, sizeof(char)); - if (usingmmap) { - memcpy(seq, (char*) filemm + entry.offset, seqlen); - } else { - fseek64(file, entry.offset, SEEK_SET); - fread(seq, sizeof(char), seqlen, file); - } - seq[seqlen] = '\0'; - char* pbegin = seq; - char* pend = seq + (seqlen/sizeof(char)); - pend = remove(pbegin, pend, '\n'); - pend = remove(pbegin, pend, '\0'); - string s = seq; - free(seq); - s.resize((pend - pbegin)/sizeof(char)); - return s; -} -// TODO cleanup; odd function. use a map -string FastaReference::sequenceNameStartingWith(string seqnameStart) { - try { - return (*index)[seqnameStart].name; - } catch (exception& e) { - cerr << e.what() << ": unable to find index entry for " << seqnameStart << endl; + int len = 0; + // NOTE: could perhaps use string.reserve, then send the pointer to avoid the copy below. + char *seq = fai_fetch(index->faidx, seqname.c_str(), &len); // TODO: update to fai_fetch64 when htslib is updated. + if (len == -1) { + cerr << "error getting sequence:" << seqname << " general error" << endl; exit(1); } + if (len == -2) { + cerr << "error getting sequence:" << seqname << " not found in index" << endl; + exit(1); + } + + string s = seq; + std::free(seq); + return s; } string FastaReference::getSubSequence(string seqname, CHRPOS start, CHRPOS length) { + //cerr << "searching:" << seqname << ":" << start << "-" << start + length << endl; FastaIndexEntry entry = index->entry(seqname); if (start < 0 || length < 1) { cerr << "Error: cannot construct subsequence with negative offset or length < 1" << endl; exit(1); } - // we have to handle newlines - // approach: count newlines before start - // count newlines by end of read - // subtracting newlines before start find count of embedded newlines - CHRPOS newlines_before = start > 0 ? (start - 1) / entry.line_blen : 0; - CHRPOS newlines_by_end = (start + length - 1) / entry.line_blen; - CHRPOS newlines_inside = newlines_by_end - newlines_before; - CHRPOS seqlen = length + newlines_inside; - char* seq = (char*) calloc (seqlen + 1, sizeof(char)); - if (usingmmap) { - memcpy(seq, (char*) filemm + entry.offset + newlines_before + start, seqlen); - } else { - fseek64(file, (off_t) (entry.offset + newlines_before + start), SEEK_SET); - fread(seq, sizeof(char), (off_t) seqlen, file); + int len = 0; + char *seq = faidx_fetch_seq(index->faidx, seqname.c_str(), start, start + length - 1, &len); // TODO: update to fai_fetch_seq64 when htslib is updated. + if (len == -1) { + cerr << "error getting sequence:" << seqname << " general error" << endl; + exit(1); } - seq[seqlen] = '\0'; - char* pbegin = seq; - char* pend = seq + (seqlen/sizeof(char)); - pend = remove(pbegin, pend, '\n'); - pend = remove(pbegin, pend, '\0'); + if (len == -2) { + cerr << "error getting sequence:" << seqname << " not found in index" << endl; + exit(1); + } + string s = seq; - free(seq); - s.resize((pend - pbegin)/sizeof(char)); + std::free(seq); return s; } @@ -351,5 +146,4 @@ CHRPOS FastaReference::sequenceLength(string seqname) { } // 0 length means the chrom wasn't found return 0; -} - +} \ No newline at end of file diff --git a/src/utils/Fasta/Fasta.h b/src/utils/Fasta/Fasta.h index 0a15844c4..a04f078ee 100644 --- a/src/utils/Fasta/Fasta.h +++ b/src/utils/Fasta/Fasta.h @@ -18,6 +18,7 @@ #include #include "LargeFileSupport.h" #include "bedFile.h" +#include "htslib/faidx.h" #include #include #include "split.h" @@ -28,32 +29,25 @@ using namespace std; class FastaIndexEntry { - friend ostream& operator<<(ostream& output, const FastaIndexEntry& e); public: - FastaIndexEntry(string name, CHRPOS length, CHRPOS offset, - CHRPOS line_blen, CHRPOS line_len, bool useFullHeader); + FastaIndexEntry(string name, CHRPOS length); FastaIndexEntry(void); ~FastaIndexEntry(void); string name; // sequence name CHRPOS length; // length of sequence - long long offset; // bytes offset of sequence from start of file - CHRPOS line_blen; // line length in bytes, sequence characters - CHRPOS line_len; // line length including newline - bool useFullHeader; void clear(void); }; class FastaIndex : public map { friend ostream& operator<<(ostream& output, FastaIndex& i); public: + faidx_t *faidx; FastaIndex(bool useFullHeader); ~FastaIndex(void); - bool useFullHeader; + bool useFullHeader; vector sequenceNames; void indexReference(string refName); void readIndexFile(string fname); - void writeIndexFile(string fname); - ifstream indexFile; FastaIndexEntry entry(string key); bool chromFound(string name); void flushEntryToIndex(FastaIndexEntry& entry); @@ -69,8 +63,6 @@ class FastaReference { bool usingfullheader; FastaReference(void) : usingmmap(false), usingfullheader(false) { } ~FastaReference(void); - FILE* file; - void* filemm; size_t filesize; FastaIndex* index; vector findSequencesStartingWith(string seqnameStart); diff --git a/test/getfasta/t.fa.gz b/test/getfasta/t.fa.gz new file mode 100644 index 000000000..ff18329e5 Binary files /dev/null and b/test/getfasta/t.fa.gz differ diff --git a/test/getfasta/t_fH.fa.fai b/test/getfasta/t_fH.fa.fai index 924e70dd6..71055bd5a 100644 --- a/test/getfasta/t_fH.fa.fai +++ b/test/getfasta/t_fH.fa.fai @@ -1 +1 @@ -chr1 assembled by consortium X 50 32 10 11 +chr1 50 32 10 11 diff --git a/test/getfasta/test-getfasta.sh b/test/getfasta/test-getfasta.sh index 60cc4bb53..dac0e9171 100644 --- a/test/getfasta/test-getfasta.sh +++ b/test/getfasta/test-getfasta.sh @@ -32,9 +32,9 @@ fi echo -e " getfasta.t02...\c" LEN=$($BT getfasta -split -fi t.fa -bed blocks.bed | awk '(NR == 2){ print length($0) }') -if [ "$LINES" != "2" ]; then +if [ "$LEN" != "22" ]; then # sizes: 2,10,10 == 22 FAILURES=$(expr $FAILURES + 1); - echo fail + echo fail $LEN else echo ok fi @@ -43,11 +43,12 @@ echo -e " getfasta.t03...\c" SEQ=$($BT getfasta -split -fi t.fa -bed blocks.bed | awk '(NR == 4){ print $0 }') if [ "$SEQ" != "cta" ]; then FAILURES=$(expr $FAILURES + 1); - echo fail + echo fail "got $SEQ" else echo ok fi + # test -fo - echo -e " getfasta.t04...\c" SEQ=$($BT getfasta -split -fi t.fa -bed blocks.bed | awk '(NR == 4){ print $0 }') @@ -71,7 +72,7 @@ fi # test -fullHeader echo -e " getfasta.t06...\c" -LINES=$(echo $'chr1 assembled by consortium X\t1\t10' | $BT getfasta -fullHeader -fi t_fH.fa -bed stdin | awk 'END{ print NR }') +LINES=$(echo $'chr1\t1\t10' | $BT getfasta -fullHeader -fi t_fH.fa -bed stdin | awk 'END{ print NR }') if [ "$LINES" != "2" ]; then FAILURES=$(expr $FAILURES + 1); echo fail @@ -238,4 +239,14 @@ $BT getfasta -fi rna.fasta -bed rna.bed -s -name -rna > obs check obs exp rm obs exp + +echo -e " getfasta.t18...\c" +SEQ=$($BT getfasta -split -fi t.fa.gz -bed blocks.bed | awk '(NR == 4){ print $0 }') +if [ "$SEQ" != "cta" ]; then + FAILURES=$(expr $FAILURES + 1); + echo fail +else + echo ok +fi + [[ $FAILURES -eq 0 ]] || exit 1;