--- /dev/null
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include "bamtools_fasta.h"
+using namespace std;
+using namespace BamTools;
+
+struct Fasta::FastaPrivate {
+
+ struct FastaIndexData {
+ string Name;
+ int32_t Length;
+ int64_t Offset;
+ int32_t LineLength;
+ int32_t ByteLength; // LineLength + newline character(s) - varies on OS where file was generated
+ };
+
+ // data members
+ FILE* Stream;
+ bool IsOpen;
+
+ FILE* IndexStream;
+ bool HasIndex;
+ bool IsIndexOpen;
+
+ vector<FastaIndexData> Index;
+
+ // ctor
+ FastaPrivate(void);
+ ~FastaPrivate(void);
+
+ // 'public' API methods
+ bool Close(void);
+ bool CreateIndex(const string& indexFilename);
+ bool GetBase(const int& refId, const int& position, char& base);
+ bool GetSequence(const int& refId, const int& start, const int& stop, string& sequence);
+ bool Open(const string& filename, const string& indexFilename);
+
+ // internal methods
+ private:
+ void Chomp(char* sequence);
+ bool GetNameFromHeader(const string& header, string& name);
+ bool GetNextHeader(string& header);
+ bool GetNextSequence(string& sequence);
+ bool LoadIndexData(void);
+ bool Rewind(void);
+ bool WriteIndexData(void);
+};
+
+Fasta::FastaPrivate::FastaPrivate(void)
+ : IsOpen(false)
+ , HasIndex(false)
+ , IsIndexOpen(false)
+{ }
+
+Fasta::FastaPrivate::~FastaPrivate(void) {
+ Close();
+}
+
+// remove any trailing newlines
+void Fasta::FastaPrivate::Chomp(char* sequence) {
+
+ static const int CHAR_LF = 10;
+ static const int CHAR_CR = 13;
+
+ size_t seqLength = strlen(sequence);
+ if ( seqLength == 0 ) return;
+ --seqLength; // ignore null terminator
+
+ while ( sequence[seqLength] == CHAR_LF ||
+ sequence[seqLength] == CHAR_CR
+ )
+ {
+ sequence[seqLength] = 0;
+ --seqLength;
+ if (seqLength < 0)
+ break;
+ }
+}
+
+bool Fasta::FastaPrivate::Close(void) {
+
+ // close fasta file
+ if ( IsOpen ) {
+ fclose(Stream);
+ IsOpen = false;
+ }
+
+ // close index file
+ if ( HasIndex && IsIndexOpen ) {
+ fclose(IndexStream);
+ HasIndex = false;
+ IsIndexOpen = false;
+ }
+
+ // return success
+ return true;
+}
+
+bool Fasta::FastaPrivate::CreateIndex(const string& indexFilename) {
+
+ // check that file is open
+ if ( !IsOpen ) {
+ cerr << "FASTA error : cannot create index, FASTA file not open" << endl;
+ return false;
+ }
+
+ // rewind FASTA file
+ if ( !Rewind() ) {
+ cerr << "FASTA error : could not rewind FASTA file" << endl;
+ return false;
+ }
+
+ // clear out prior index data
+ Index.clear();
+
+ // -------------------------------------------
+ // calculate lineLength & byteLength
+
+ int lineLength = 0;
+ int byteLength = 0;
+
+ // skip over header
+ char buffer[1024];
+ if ( fgets(buffer, 1024, Stream) == 0 ) {
+ cerr << "FASTA error : could not read from file" << endl;
+ return false;
+ }
+ if ( feof(Stream) ) return false;
+ if ( buffer[0] != '>' ) {
+ cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
+ return false;
+ }
+
+ // read in first line of sequence
+ char c = fgetc(Stream);
+ while ( (c >= 0) && (c != '\n') ) {
+ ++byteLength;
+ if (isgraph(c)) ++lineLength;
+ c = fgetc(Stream);
+ }
+ ++byteLength; // store newline
+
+ // rewind FASTA file
+ if ( !Rewind() ) {
+ cerr << "FASTA error : could not rewind FASTA file" << endl;
+ return false;
+ }
+
+ // iterate through fasta entries
+ int currentId = 0;
+ string header = "";
+ string sequence = "";
+ while ( GetNextHeader(header) ) {
+
+ // build index entry data
+ FastaIndexData data;
+ GetNameFromHeader(header, data.Name);
+ data.Offset = ftello(Stream);
+
+ GetNextSequence(sequence);
+ data.Length = sequence.length();
+ data.LineLength = lineLength;
+ data.ByteLength = byteLength;
+
+ // store index entry
+ Index.push_back(data);
+
+ // update ref Id
+ ++currentId;
+ }
+
+ // open index file
+ if ( !indexFilename.empty() ) {
+ IndexStream = fopen64(indexFilename.c_str(), "wb");
+ if ( !IndexStream ) {
+ cerr << "FASTA error : Could not open " << indexFilename << " for writing." << endl;
+ return false;
+ }
+ IsIndexOpen = true;
+ }
+
+ // write index data
+ if ( !WriteIndexData() ) return false;
+ HasIndex = true;
+
+ // close index file
+ fclose(IndexStream);
+ IsIndexOpen = false;
+
+ // return succes status
+ return true;
+}
+
+bool Fasta::FastaPrivate::GetBase(const int& refId, const int& position, char& base) {
+
+ // make sure FASTA file is open
+ if ( !IsOpen ) {
+ cerr << "FASTA error : file not open for reading" << endl;
+ return false;
+ }
+
+ // use index if available
+ if ( HasIndex && !Index.empty() ) {
+
+ // validate reference id
+ if ( (refId < 0) || (refId >= (int)Index.size()) ) {
+ cerr << "FASTA error: invalid refId specified: " << refId << endl;
+ return false;
+ }
+
+ // retrieve reference index data
+ const FastaIndexData& referenceData = Index.at(refId);
+
+ // validate position
+ if ( (position < 0) || (position > referenceData.Length) ) {
+ cerr << "FASTA error: invalid position specified: " << position << endl;
+ return false;
+ }
+
+ // seek to beginning of sequence data
+ if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
+ cerr << "FASTA error : could not sek in file" << endl;
+ return false;
+ }
+
+ // retrieve sequence
+ string sequence = "";
+ if ( !GetNextSequence(sequence) ) {
+ cerr << "FASTA error : could not retrieve base from FASTA file" << endl;
+ return false;
+ }
+
+ // set base & return success
+ base = sequence.at(position);
+ return true;
+ }
+
+ // else plow through sequentially
+ else {
+
+ // rewind FASTA file
+ if ( !Rewind() ) {
+ cerr << "FASTA error : could not rewind FASTA file" << endl;
+ return false;
+ }
+
+ // iterate through fasta entries
+ int currentId = 0;
+ string header = "";
+ string sequence = "";
+
+ // get first entry
+ GetNextHeader(header);
+ GetNextSequence(sequence);
+
+ while ( currentId != refId ) {
+ GetNextHeader(header);
+ GetNextSequence(sequence);
+ ++currentId;
+ }
+
+ // get desired base from sequence
+ // TODO: error reporting on invalid position
+ if ( currentId == refId && (sequence.length() >= (size_t)position) ) {
+ base = sequence.at(position);
+ return true;
+ }
+
+ // could not get sequence
+ return false;
+ }
+
+ // return success
+ return true;
+}
+
+bool Fasta::FastaPrivate::GetNameFromHeader(const string& header, string& name) {
+
+ // get rid of the leading greater than sign
+ string s = header.substr(1);
+
+ // extract the first non-whitespace segment
+ char* pName = (char*)s.data();
+ unsigned int nameLen = (unsigned int)s.size();
+
+ unsigned int start = 0;
+ while ( (pName[start] == 32) || (pName[start] == 9) || (pName[start] == 10) || (pName[start] == 13) ) {
+ start++;
+ if ( start == nameLen )
+ break;
+ }
+
+ unsigned int stop = start;
+ if ( stop < nameLen ) {
+ while( (pName[stop] != 32) && (pName[stop] != 9) && (pName[stop] != 10) && (pName[stop] != 13) ) {
+ stop++;
+ if ( stop == nameLen )
+ break;
+ }
+ }
+
+ if ( start == stop ) {
+ cout << "FASTA error : could not parse read name from FASTA header." << endl;
+ return false;
+ }
+
+ name = s.substr(start, stop - start).c_str();
+ return true;
+}
+
+bool Fasta::FastaPrivate::GetNextHeader(string& header) {
+
+ // validate input stream
+ if ( !IsOpen || feof(Stream) )
+ return false;
+
+ // read in header line
+ char buffer[1024];
+ if ( fgets(buffer, 1024, Stream) == 0 ) {
+ cerr << "FASTA error : could not read from file" << endl;
+ return false;
+ }
+
+ // make sure it's a FASTA header
+ if ( buffer[0] != '>' ) {
+ cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
+ return false;
+ }
+
+ // import buffer contents to header string
+ stringstream headerBuffer("");
+ headerBuffer << buffer;
+ header = headerBuffer.str();
+
+ // return success
+ return true;
+}
+
+bool Fasta::FastaPrivate::GetNextSequence(string& sequence) {
+
+ // validate input stream
+ if ( !IsOpen || feof(Stream) )
+ return false;
+
+ // read in sequence
+ char buffer[1024];
+ ostringstream seqBuffer("");
+ while(true) {
+
+ char ch = fgetc(Stream);
+ ungetc(ch, Stream);
+ if( (ch == '>') || feof(Stream) )
+ break;
+
+ if ( fgets(buffer, 1024, Stream) == 0 ) {
+ cerr << "FASTA error : could not read from file" << endl;
+ return false;
+ }
+
+ Chomp(buffer);
+ seqBuffer << buffer;
+ }
+
+ // import buffer contents to sequence string
+ sequence = seqBuffer.str();
+
+ // return success
+ return true;
+}
+
+bool Fasta::FastaPrivate::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
+
+ // make sure FASTA file is open
+ if ( !IsOpen ) {
+ cerr << "FASTA error : file not open for reading" << endl;
+ return false;
+ }
+
+ // use index if available
+ if ( HasIndex && !Index.empty() ) {
+
+ // validate reference id
+ if ( (refId < 0) || (refId >= (int)Index.size()) ) {
+ cerr << "FASTA error: invalid refId specified: " << refId << endl;
+ return false;
+ }
+
+ // retrieve reference index data
+ const FastaIndexData& referenceData = Index.at(refId);
+
+ // validate stop position
+ if ( (start < 0) || (start > stop) || (stop > referenceData.Length) ) {
+ cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " << stop << endl;
+ return false;
+ }
+
+ // seek to beginning of sequence data
+ if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
+ cerr << "FASTA error : could not sek in file" << endl;
+ return false;
+ }
+
+ // retrieve full sequence
+ string fullSequence = "";
+ if ( !GetNextSequence(fullSequence) ) {
+ cerr << "FASTA error : could not retrieve sequence from FASTA file" << endl;
+ return false;
+ }
+
+ // set sub-sequence & return success
+ const int seqLength = (stop - start) + 1;
+ sequence = fullSequence.substr(start, seqLength);
+ return true;
+ }
+
+ // else plow through sequentially
+ else {
+
+ // rewind FASTA file
+ if ( !Rewind() ) {
+ cerr << "FASTA error : could not rewind FASTA file" << endl;
+ return false;
+ }
+
+ // iterate through fasta entries
+ int currentId = 0;
+ string header = "";
+ string fullSequence = "";
+
+ // get first entry
+ GetNextHeader(header);
+ GetNextSequence(fullSequence);
+
+ while ( currentId != refId ) {
+ GetNextHeader(header);
+ GetNextSequence(fullSequence);
+ ++currentId;
+ }
+
+ // get desired substring from sequence
+ // TODO: error reporting on invalid start/stop positions
+ if ( currentId == refId && (fullSequence.length() >= (size_t)stop) ) {
+ const int seqLength = (stop - start) + 1;
+ sequence = fullSequence.substr(start, seqLength);
+ return true;
+ }
+
+ // could not get sequence
+ return false;
+ }
+
+ // return success
+ return true;
+}
+
+bool Fasta::FastaPrivate::LoadIndexData(void) {
+
+ // skip if no index file available
+ if ( !IsIndexOpen ) return false;
+
+ // clear any prior index data
+ Index.clear();
+
+ char buffer[1024];
+ stringstream indexBuffer;
+ while ( true ) {
+
+ char c = fgetc(IndexStream);
+ if ( (c == '\n') || feof(IndexStream) ) break;
+ ungetc(c, IndexStream);
+
+ // clear index buffer
+ indexBuffer.str("");
+
+ // read line from index file
+ if ( fgets(buffer, 1024, IndexStream) == 0 ) {
+ cerr << "FASTA LoadIndexData() error : could not read from index file" << endl;
+ HasIndex = false;
+ return false;
+ }
+
+ // store line in indexBuffer
+ indexBuffer << buffer;
+
+ // retrieve fasta index data from line
+ FastaIndexData data;
+ indexBuffer >> data.Name;
+ indexBuffer >> data.Length;
+ indexBuffer >> data.Offset;
+ indexBuffer >> data.LineLength;
+ indexBuffer >> data.ByteLength;
+
+ // store index entry
+ Index.push_back(data);
+ }
+
+ return true;
+}
+
+bool Fasta::FastaPrivate::Open(const string& filename, const string& indexFilename) {
+
+ bool success = true;
+
+ // open FASTA filename
+ Stream = fopen64(filename.c_str(), "rb");
+ if ( !Stream ) {
+ cerr << "FASTA error: Could not open " << filename << " for reading" << endl;
+ return false;
+ }
+ IsOpen = true;
+ success &= IsOpen;
+
+ // open index file if it exists
+ if ( !indexFilename.empty() ) {
+ IndexStream = fopen64(indexFilename.c_str(), "rb");
+ if ( !IndexStream ) {
+ cerr << "FASTA error : Could not open " << indexFilename << " for reading." << endl;
+ return false;
+ }
+ IsIndexOpen = true;
+ success &= IsIndexOpen;
+
+ // attempt to load index data
+ HasIndex = LoadIndexData();
+ success &= HasIndex;
+ }
+
+ // return success status
+ return success;
+}
+
+bool Fasta::FastaPrivate::Rewind(void) {
+ if ( !IsOpen ) return false;
+ return ( fseeko(Stream, 0, SEEK_SET) == 0 );
+}
+
+bool Fasta::FastaPrivate::WriteIndexData(void) {
+
+ // skip if no index file available
+ if ( !IsIndexOpen ) return false;
+
+ // iterate over index entries
+ bool success = true;
+ stringstream indexBuffer;
+ vector<FastaIndexData>::const_iterator indexIter = Index.begin();
+ vector<FastaIndexData>::const_iterator indexEnd = Index.end();
+ for ( ; indexIter != indexEnd; ++indexIter ) {
+
+ // clear stream
+ indexBuffer.str("");
+
+ // write data to stream
+ const FastaIndexData& data = (*indexIter);
+ indexBuffer << data.Name << "\t"
+ << data.Length << "\t"
+ << data.Offset << "\t"
+ << data.LineLength << "\t"
+ << data.ByteLength << endl;
+
+ // write stream to file
+ success &= ( fputs(indexBuffer.str().c_str(), IndexStream) >= 0 );
+ }
+
+ // return success status
+ return success;
+}
+
+// --------------------------------
+// Fasta implementation
+
+Fasta::Fasta(void) {
+ d = new FastaPrivate;
+}
+
+Fasta::~Fasta(void) {
+ delete d;
+ d = 0;
+}
+
+bool Fasta::Close(void) {
+ return d->Close();
+}
+
+bool Fasta::CreateIndex(const string& indexFilename) {
+ return d->CreateIndex(indexFilename);
+}
+
+bool Fasta::GetBase(const int& refId, const int& position, char& base) {
+ return d->GetBase(refId, position, base);
+}
+
+bool Fasta::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
+ return d->GetSequence(refId, start, stop, sequence);
+}
+
+bool Fasta::Open(const string& filename, const string& indexFilename) {
+ return d->Open(filename, indexFilename);
+}
\ No newline at end of file