From: Derek Date: Tue, 13 Jul 2010 01:57:40 +0000 (-0400) Subject: Added support for reading FASTA sequences, as well as generating FASTA index (.fai... X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=ce1a192b46d5d84c7d6c9c81c868dbaf46d79edd;p=bamtools.git Added support for reading FASTA sequences, as well as generating FASTA index (.fai) files. TODO: need to drop FASTA functionality into pileup conversion, as well as create command line feature to generate FASTA indices --- diff --git a/bamtools_fasta.cpp b/bamtools_fasta.cpp new file mode 100644 index 0000000..74cde03 --- /dev/null +++ b/bamtools_fasta.cpp @@ -0,0 +1,603 @@ +#include +#include +#include +#include +#include +#include +#include +#include "bamtools_fasta.h" +using namespace std; +using namespace BamTools; + +struct Fasta::FastaPrivate { + + struct FastaIndexData { + string Name; + int32_t Length; + int64_t Offset; + int32_t LineLength; + int32_t ByteLength; // LineLength + newline character(s) - varies on OS where file was generated + }; + + // data members + FILE* Stream; + bool IsOpen; + + FILE* IndexStream; + bool HasIndex; + bool IsIndexOpen; + + vector Index; + + // ctor + FastaPrivate(void); + ~FastaPrivate(void); + + // 'public' API methods + bool Close(void); + bool CreateIndex(const string& indexFilename); + bool GetBase(const int& refId, const int& position, char& base); + bool GetSequence(const int& refId, const int& start, const int& stop, string& sequence); + bool Open(const string& filename, const string& indexFilename); + + // internal methods + private: + void Chomp(char* sequence); + bool GetNameFromHeader(const string& header, string& name); + bool GetNextHeader(string& header); + bool GetNextSequence(string& sequence); + bool LoadIndexData(void); + bool Rewind(void); + bool WriteIndexData(void); +}; + +Fasta::FastaPrivate::FastaPrivate(void) + : IsOpen(false) + , HasIndex(false) + , IsIndexOpen(false) +{ } + +Fasta::FastaPrivate::~FastaPrivate(void) { + Close(); +} + +// remove any trailing newlines +void Fasta::FastaPrivate::Chomp(char* sequence) { + + static const int CHAR_LF = 10; + static const int CHAR_CR = 13; + + size_t seqLength = strlen(sequence); + if ( seqLength == 0 ) return; + --seqLength; // ignore null terminator + + while ( sequence[seqLength] == CHAR_LF || + sequence[seqLength] == CHAR_CR + ) + { + sequence[seqLength] = 0; + --seqLength; + if (seqLength < 0) + break; + } +} + +bool Fasta::FastaPrivate::Close(void) { + + // close fasta file + if ( IsOpen ) { + fclose(Stream); + IsOpen = false; + } + + // close index file + if ( HasIndex && IsIndexOpen ) { + fclose(IndexStream); + HasIndex = false; + IsIndexOpen = false; + } + + // return success + return true; +} + +bool Fasta::FastaPrivate::CreateIndex(const string& indexFilename) { + + // check that file is open + if ( !IsOpen ) { + cerr << "FASTA error : cannot create index, FASTA file not open" << endl; + return false; + } + + // rewind FASTA file + if ( !Rewind() ) { + cerr << "FASTA error : could not rewind FASTA file" << endl; + return false; + } + + // clear out prior index data + Index.clear(); + + // ------------------------------------------- + // calculate lineLength & byteLength + + int lineLength = 0; + int byteLength = 0; + + // skip over header + char buffer[1024]; + if ( fgets(buffer, 1024, Stream) == 0 ) { + cerr << "FASTA error : could not read from file" << endl; + return false; + } + if ( feof(Stream) ) return false; + if ( buffer[0] != '>' ) { + cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl; + return false; + } + + // read in first line of sequence + char c = fgetc(Stream); + while ( (c >= 0) && (c != '\n') ) { + ++byteLength; + if (isgraph(c)) ++lineLength; + c = fgetc(Stream); + } + ++byteLength; // store newline + + // rewind FASTA file + if ( !Rewind() ) { + cerr << "FASTA error : could not rewind FASTA file" << endl; + return false; + } + + // iterate through fasta entries + int currentId = 0; + string header = ""; + string sequence = ""; + while ( GetNextHeader(header) ) { + + // build index entry data + FastaIndexData data; + GetNameFromHeader(header, data.Name); + data.Offset = ftello(Stream); + + GetNextSequence(sequence); + data.Length = sequence.length(); + data.LineLength = lineLength; + data.ByteLength = byteLength; + + // store index entry + Index.push_back(data); + + // update ref Id + ++currentId; + } + + // open index file + if ( !indexFilename.empty() ) { + IndexStream = fopen64(indexFilename.c_str(), "wb"); + if ( !IndexStream ) { + cerr << "FASTA error : Could not open " << indexFilename << " for writing." << endl; + return false; + } + IsIndexOpen = true; + } + + // write index data + if ( !WriteIndexData() ) return false; + HasIndex = true; + + // close index file + fclose(IndexStream); + IsIndexOpen = false; + + // return succes status + return true; +} + +bool Fasta::FastaPrivate::GetBase(const int& refId, const int& position, char& base) { + + // make sure FASTA file is open + if ( !IsOpen ) { + cerr << "FASTA error : file not open for reading" << endl; + return false; + } + + // use index if available + if ( HasIndex && !Index.empty() ) { + + // validate reference id + if ( (refId < 0) || (refId >= (int)Index.size()) ) { + cerr << "FASTA error: invalid refId specified: " << refId << endl; + return false; + } + + // retrieve reference index data + const FastaIndexData& referenceData = Index.at(refId); + + // validate position + if ( (position < 0) || (position > referenceData.Length) ) { + cerr << "FASTA error: invalid position specified: " << position << endl; + return false; + } + + // seek to beginning of sequence data + if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) { + cerr << "FASTA error : could not sek in file" << endl; + return false; + } + + // retrieve sequence + string sequence = ""; + if ( !GetNextSequence(sequence) ) { + cerr << "FASTA error : could not retrieve base from FASTA file" << endl; + return false; + } + + // set base & return success + base = sequence.at(position); + return true; + } + + // else plow through sequentially + else { + + // rewind FASTA file + if ( !Rewind() ) { + cerr << "FASTA error : could not rewind FASTA file" << endl; + return false; + } + + // iterate through fasta entries + int currentId = 0; + string header = ""; + string sequence = ""; + + // get first entry + GetNextHeader(header); + GetNextSequence(sequence); + + while ( currentId != refId ) { + GetNextHeader(header); + GetNextSequence(sequence); + ++currentId; + } + + // get desired base from sequence + // TODO: error reporting on invalid position + if ( currentId == refId && (sequence.length() >= (size_t)position) ) { + base = sequence.at(position); + return true; + } + + // could not get sequence + return false; + } + + // return success + return true; +} + +bool Fasta::FastaPrivate::GetNameFromHeader(const string& header, string& name) { + + // get rid of the leading greater than sign + string s = header.substr(1); + + // extract the first non-whitespace segment + char* pName = (char*)s.data(); + unsigned int nameLen = (unsigned int)s.size(); + + unsigned int start = 0; + while ( (pName[start] == 32) || (pName[start] == 9) || (pName[start] == 10) || (pName[start] == 13) ) { + start++; + if ( start == nameLen ) + break; + } + + unsigned int stop = start; + if ( stop < nameLen ) { + while( (pName[stop] != 32) && (pName[stop] != 9) && (pName[stop] != 10) && (pName[stop] != 13) ) { + stop++; + if ( stop == nameLen ) + break; + } + } + + if ( start == stop ) { + cout << "FASTA error : could not parse read name from FASTA header." << endl; + return false; + } + + name = s.substr(start, stop - start).c_str(); + return true; +} + +bool Fasta::FastaPrivate::GetNextHeader(string& header) { + + // validate input stream + if ( !IsOpen || feof(Stream) ) + return false; + + // read in header line + char buffer[1024]; + if ( fgets(buffer, 1024, Stream) == 0 ) { + cerr << "FASTA error : could not read from file" << endl; + return false; + } + + // make sure it's a FASTA header + if ( buffer[0] != '>' ) { + cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl; + return false; + } + + // import buffer contents to header string + stringstream headerBuffer(""); + headerBuffer << buffer; + header = headerBuffer.str(); + + // return success + return true; +} + +bool Fasta::FastaPrivate::GetNextSequence(string& sequence) { + + // validate input stream + if ( !IsOpen || feof(Stream) ) + return false; + + // read in sequence + char buffer[1024]; + ostringstream seqBuffer(""); + while(true) { + + char ch = fgetc(Stream); + ungetc(ch, Stream); + if( (ch == '>') || feof(Stream) ) + break; + + if ( fgets(buffer, 1024, Stream) == 0 ) { + cerr << "FASTA error : could not read from file" << endl; + return false; + } + + Chomp(buffer); + seqBuffer << buffer; + } + + // import buffer contents to sequence string + sequence = seqBuffer.str(); + + // return success + return true; +} + +bool Fasta::FastaPrivate::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) { + + // make sure FASTA file is open + if ( !IsOpen ) { + cerr << "FASTA error : file not open for reading" << endl; + return false; + } + + // use index if available + if ( HasIndex && !Index.empty() ) { + + // validate reference id + if ( (refId < 0) || (refId >= (int)Index.size()) ) { + cerr << "FASTA error: invalid refId specified: " << refId << endl; + return false; + } + + // retrieve reference index data + const FastaIndexData& referenceData = Index.at(refId); + + // validate stop position + if ( (start < 0) || (start > stop) || (stop > referenceData.Length) ) { + cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " << stop << endl; + return false; + } + + // seek to beginning of sequence data + if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) { + cerr << "FASTA error : could not sek in file" << endl; + return false; + } + + // retrieve full sequence + string fullSequence = ""; + if ( !GetNextSequence(fullSequence) ) { + cerr << "FASTA error : could not retrieve sequence from FASTA file" << endl; + return false; + } + + // set sub-sequence & return success + const int seqLength = (stop - start) + 1; + sequence = fullSequence.substr(start, seqLength); + return true; + } + + // else plow through sequentially + else { + + // rewind FASTA file + if ( !Rewind() ) { + cerr << "FASTA error : could not rewind FASTA file" << endl; + return false; + } + + // iterate through fasta entries + int currentId = 0; + string header = ""; + string fullSequence = ""; + + // get first entry + GetNextHeader(header); + GetNextSequence(fullSequence); + + while ( currentId != refId ) { + GetNextHeader(header); + GetNextSequence(fullSequence); + ++currentId; + } + + // get desired substring from sequence + // TODO: error reporting on invalid start/stop positions + if ( currentId == refId && (fullSequence.length() >= (size_t)stop) ) { + const int seqLength = (stop - start) + 1; + sequence = fullSequence.substr(start, seqLength); + return true; + } + + // could not get sequence + return false; + } + + // return success + return true; +} + +bool Fasta::FastaPrivate::LoadIndexData(void) { + + // skip if no index file available + if ( !IsIndexOpen ) return false; + + // clear any prior index data + Index.clear(); + + char buffer[1024]; + stringstream indexBuffer; + while ( true ) { + + char c = fgetc(IndexStream); + if ( (c == '\n') || feof(IndexStream) ) break; + ungetc(c, IndexStream); + + // clear index buffer + indexBuffer.str(""); + + // read line from index file + if ( fgets(buffer, 1024, IndexStream) == 0 ) { + cerr << "FASTA LoadIndexData() error : could not read from index file" << endl; + HasIndex = false; + return false; + } + + // store line in indexBuffer + indexBuffer << buffer; + + // retrieve fasta index data from line + FastaIndexData data; + indexBuffer >> data.Name; + indexBuffer >> data.Length; + indexBuffer >> data.Offset; + indexBuffer >> data.LineLength; + indexBuffer >> data.ByteLength; + + // store index entry + Index.push_back(data); + } + + return true; +} + +bool Fasta::FastaPrivate::Open(const string& filename, const string& indexFilename) { + + bool success = true; + + // open FASTA filename + Stream = fopen64(filename.c_str(), "rb"); + if ( !Stream ) { + cerr << "FASTA error: Could not open " << filename << " for reading" << endl; + return false; + } + IsOpen = true; + success &= IsOpen; + + // open index file if it exists + if ( !indexFilename.empty() ) { + IndexStream = fopen64(indexFilename.c_str(), "rb"); + if ( !IndexStream ) { + cerr << "FASTA error : Could not open " << indexFilename << " for reading." << endl; + return false; + } + IsIndexOpen = true; + success &= IsIndexOpen; + + // attempt to load index data + HasIndex = LoadIndexData(); + success &= HasIndex; + } + + // return success status + return success; +} + +bool Fasta::FastaPrivate::Rewind(void) { + if ( !IsOpen ) return false; + return ( fseeko(Stream, 0, SEEK_SET) == 0 ); +} + +bool Fasta::FastaPrivate::WriteIndexData(void) { + + // skip if no index file available + if ( !IsIndexOpen ) return false; + + // iterate over index entries + bool success = true; + stringstream indexBuffer; + vector::const_iterator indexIter = Index.begin(); + vector::const_iterator indexEnd = Index.end(); + for ( ; indexIter != indexEnd; ++indexIter ) { + + // clear stream + indexBuffer.str(""); + + // write data to stream + const FastaIndexData& data = (*indexIter); + indexBuffer << data.Name << "\t" + << data.Length << "\t" + << data.Offset << "\t" + << data.LineLength << "\t" + << data.ByteLength << endl; + + // write stream to file + success &= ( fputs(indexBuffer.str().c_str(), IndexStream) >= 0 ); + } + + // return success status + return success; +} + +// -------------------------------- +// Fasta implementation + +Fasta::Fasta(void) { + d = new FastaPrivate; +} + +Fasta::~Fasta(void) { + delete d; + d = 0; +} + +bool Fasta::Close(void) { + return d->Close(); +} + +bool Fasta::CreateIndex(const string& indexFilename) { + return d->CreateIndex(indexFilename); +} + +bool Fasta::GetBase(const int& refId, const int& position, char& base) { + return d->GetBase(refId, position, base); +} + +bool Fasta::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) { + return d->GetSequence(refId, start, stop, sequence); +} + +bool Fasta::Open(const string& filename, const string& indexFilename) { + return d->Open(filename, indexFilename); +} \ No newline at end of file diff --git a/bamtools_fasta.h b/bamtools_fasta.h new file mode 100644 index 0000000..43c262b --- /dev/null +++ b/bamtools_fasta.h @@ -0,0 +1,37 @@ +#ifndef BAMTOOLS_FASTA_H +#define BAMTOOLS_FASTA_H + +#include + +namespace BamTools { + +class Fasta { + + // ctor & dtor + public: + Fasta(void); + ~Fasta(void); + + // file-handling methods + public: + bool Close(void); + bool Open(const std::string& filename, const std::string& indexFilename = ""); + + // sequence access methods + public: + bool GetBase(const int& refID, const int& position, char& base); + bool GetSequence(const int& refId, const int& start, const int& stop, std::string& sequence); + + // index-handling methods + public: + bool CreateIndex(const std::string& indexFilename); + + // internal implementation + private: + struct FastaPrivate; + FastaPrivate* d; +}; + +} // BAMTOOLS_FASTA_H + +#endif // BAMTOOLS_FASTA_H \ No newline at end of file