8 #include "bamtools_fasta.h"
10 using namespace BamTools;
12 struct Fasta::FastaPrivate {
14 struct FastaIndexData {
19 int32_t ByteLength; // LineLength + newline character(s) - varies on OS where file was generated
30 vector<FastaIndexData> Index;
36 // 'public' API methods
38 bool CreateIndex(const string& indexFilename);
39 bool GetBase(const int& refId, const int& position, char& base);
40 bool GetSequence(const int& refId, const int& start, const int& stop, string& sequence);
41 bool Open(const string& filename, const string& indexFilename);
45 void Chomp(char* sequence);
46 bool GetNameFromHeader(const string& header, string& name);
47 bool GetNextHeader(string& header);
48 bool GetNextSequence(string& sequence);
49 bool LoadIndexData(void);
51 bool WriteIndexData(void);
54 Fasta::FastaPrivate::FastaPrivate(void)
60 Fasta::FastaPrivate::~FastaPrivate(void) {
64 // remove any trailing newlines
65 void Fasta::FastaPrivate::Chomp(char* sequence) {
67 static const int CHAR_LF = 10;
68 static const int CHAR_CR = 13;
70 size_t seqLength = strlen(sequence);
71 if ( seqLength == 0 ) return;
72 --seqLength; // ignore null terminator
74 while ( sequence[seqLength] == CHAR_LF ||
75 sequence[seqLength] == CHAR_CR
78 sequence[seqLength] = 0;
85 bool Fasta::FastaPrivate::Close(void) {
94 if ( HasIndex && IsIndexOpen ) {
104 bool Fasta::FastaPrivate::CreateIndex(const string& indexFilename) {
106 // check that file is open
108 cerr << "FASTA error : cannot create index, FASTA file not open" << endl;
114 cerr << "FASTA error : could not rewind FASTA file" << endl;
118 // clear out prior index data
121 // -------------------------------------------
122 // calculate lineLength & byteLength
129 if ( fgets(buffer, 1024, Stream) == 0 ) {
130 cerr << "FASTA error : could not read from file" << endl;
133 if ( feof(Stream) ) return false;
134 if ( buffer[0] != '>' ) {
135 cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
139 // read in first line of sequence
140 char c = fgetc(Stream);
141 while ( (c >= 0) && (c != '\n') ) {
143 if (isgraph(c)) ++lineLength;
146 ++byteLength; // store newline
150 cerr << "FASTA error : could not rewind FASTA file" << endl;
154 // iterate through fasta entries
157 string sequence = "";
158 while ( GetNextHeader(header) ) {
160 // build index entry data
162 GetNameFromHeader(header, data.Name);
163 data.Offset = ftello(Stream);
165 GetNextSequence(sequence);
166 data.Length = sequence.length();
167 data.LineLength = lineLength;
168 data.ByteLength = byteLength;
171 Index.push_back(data);
178 if ( !indexFilename.empty() ) {
179 IndexStream = fopen64(indexFilename.c_str(), "wb");
180 if ( !IndexStream ) {
181 cerr << "FASTA error : Could not open " << indexFilename << " for writing." << endl;
188 if ( !WriteIndexData() ) return false;
195 // return succes status
199 bool Fasta::FastaPrivate::GetBase(const int& refId, const int& position, char& base) {
201 // make sure FASTA file is open
203 cerr << "FASTA error : file not open for reading" << endl;
207 // use index if available
208 if ( HasIndex && !Index.empty() ) {
210 // validate reference id
211 if ( (refId < 0) || (refId >= (int)Index.size()) ) {
212 cerr << "FASTA error: invalid refId specified: " << refId << endl;
216 // retrieve reference index data
217 const FastaIndexData& referenceData = Index.at(refId);
220 if ( (position < 0) || (position > referenceData.Length) ) {
221 cerr << "FASTA error: invalid position specified: " << position << endl;
225 // seek to beginning of sequence data
226 if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
227 cerr << "FASTA error : could not sek in file" << endl;
232 string sequence = "";
233 if ( !GetNextSequence(sequence) ) {
234 cerr << "FASTA error : could not retrieve base from FASTA file" << endl;
238 // set base & return success
239 base = sequence.at(position);
243 // else plow through sequentially
248 cerr << "FASTA error : could not rewind FASTA file" << endl;
252 // iterate through fasta entries
255 string sequence = "";
258 GetNextHeader(header);
259 GetNextSequence(sequence);
261 while ( currentId != refId ) {
262 GetNextHeader(header);
263 GetNextSequence(sequence);
267 // get desired base from sequence
268 // TODO: error reporting on invalid position
269 if ( currentId == refId && (sequence.length() >= (size_t)position) ) {
270 base = sequence.at(position);
274 // could not get sequence
282 bool Fasta::FastaPrivate::GetNameFromHeader(const string& header, string& name) {
284 // get rid of the leading greater than sign
285 string s = header.substr(1);
287 // extract the first non-whitespace segment
288 char* pName = (char*)s.data();
289 unsigned int nameLen = (unsigned int)s.size();
291 unsigned int start = 0;
292 while ( (pName[start] == 32) || (pName[start] == 9) || (pName[start] == 10) || (pName[start] == 13) ) {
294 if ( start == nameLen )
298 unsigned int stop = start;
299 if ( stop < nameLen ) {
300 while( (pName[stop] != 32) && (pName[stop] != 9) && (pName[stop] != 10) && (pName[stop] != 13) ) {
302 if ( stop == nameLen )
307 if ( start == stop ) {
308 cout << "FASTA error : could not parse read name from FASTA header." << endl;
312 name = s.substr(start, stop - start).c_str();
316 bool Fasta::FastaPrivate::GetNextHeader(string& header) {
318 // validate input stream
319 if ( !IsOpen || feof(Stream) )
322 // read in header line
324 if ( fgets(buffer, 1024, Stream) == 0 ) {
325 cerr << "FASTA error : could not read from file" << endl;
329 // make sure it's a FASTA header
330 if ( buffer[0] != '>' ) {
331 cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
335 // import buffer contents to header string
336 stringstream headerBuffer("");
337 headerBuffer << buffer;
338 header = headerBuffer.str();
344 bool Fasta::FastaPrivate::GetNextSequence(string& sequence) {
346 // validate input stream
347 if ( !IsOpen || feof(Stream) )
352 ostringstream seqBuffer("");
355 char ch = fgetc(Stream);
357 if( (ch == '>') || feof(Stream) )
360 if ( fgets(buffer, 1024, Stream) == 0 ) {
361 cerr << "FASTA error : could not read from file" << endl;
369 // import buffer contents to sequence string
370 sequence = seqBuffer.str();
376 bool Fasta::FastaPrivate::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
378 // make sure FASTA file is open
380 cerr << "FASTA error : file not open for reading" << endl;
384 // use index if available
385 if ( HasIndex && !Index.empty() ) {
387 // validate reference id
388 if ( (refId < 0) || (refId >= (int)Index.size()) ) {
389 cerr << "FASTA error: invalid refId specified: " << refId << endl;
393 // retrieve reference index data
394 const FastaIndexData& referenceData = Index.at(refId);
396 // validate stop position
397 if ( (start < 0) || (start > stop) || (stop > referenceData.Length) ) {
398 cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " << stop << endl;
402 // seek to beginning of sequence data
403 if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
404 cerr << "FASTA error : could not sek in file" << endl;
408 // retrieve full sequence
409 string fullSequence = "";
410 if ( !GetNextSequence(fullSequence) ) {
411 cerr << "FASTA error : could not retrieve sequence from FASTA file" << endl;
415 // set sub-sequence & return success
416 const int seqLength = (stop - start) + 1;
417 sequence = fullSequence.substr(start, seqLength);
421 // else plow through sequentially
426 cerr << "FASTA error : could not rewind FASTA file" << endl;
430 // iterate through fasta entries
433 string fullSequence = "";
436 GetNextHeader(header);
437 GetNextSequence(fullSequence);
439 while ( currentId != refId ) {
440 GetNextHeader(header);
441 GetNextSequence(fullSequence);
445 // get desired substring from sequence
446 // TODO: error reporting on invalid start/stop positions
447 if ( currentId == refId && (fullSequence.length() >= (size_t)stop) ) {
448 const int seqLength = (stop - start) + 1;
449 sequence = fullSequence.substr(start, seqLength);
453 // could not get sequence
461 bool Fasta::FastaPrivate::LoadIndexData(void) {
463 // skip if no index file available
464 if ( !IsIndexOpen ) return false;
466 // clear any prior index data
470 stringstream indexBuffer;
473 char c = fgetc(IndexStream);
474 if ( (c == '\n') || feof(IndexStream) ) break;
475 ungetc(c, IndexStream);
477 // clear index buffer
480 // read line from index file
481 if ( fgets(buffer, 1024, IndexStream) == 0 ) {
482 cerr << "FASTA LoadIndexData() error : could not read from index file" << endl;
487 // store line in indexBuffer
488 indexBuffer << buffer;
490 // retrieve fasta index data from line
492 indexBuffer >> data.Name;
493 indexBuffer >> data.Length;
494 indexBuffer >> data.Offset;
495 indexBuffer >> data.LineLength;
496 indexBuffer >> data.ByteLength;
499 Index.push_back(data);
505 bool Fasta::FastaPrivate::Open(const string& filename, const string& indexFilename) {
509 // open FASTA filename
510 Stream = fopen64(filename.c_str(), "rb");
512 cerr << "FASTA error: Could not open " << filename << " for reading" << endl;
518 // open index file if it exists
519 if ( !indexFilename.empty() ) {
520 IndexStream = fopen64(indexFilename.c_str(), "rb");
521 if ( !IndexStream ) {
522 cerr << "FASTA error : Could not open " << indexFilename << " for reading." << endl;
526 success &= IsIndexOpen;
528 // attempt to load index data
529 HasIndex = LoadIndexData();
533 // return success status
537 bool Fasta::FastaPrivate::Rewind(void) {
538 if ( !IsOpen ) return false;
539 return ( fseeko(Stream, 0, SEEK_SET) == 0 );
542 bool Fasta::FastaPrivate::WriteIndexData(void) {
544 // skip if no index file available
545 if ( !IsIndexOpen ) return false;
547 // iterate over index entries
549 stringstream indexBuffer;
550 vector<FastaIndexData>::const_iterator indexIter = Index.begin();
551 vector<FastaIndexData>::const_iterator indexEnd = Index.end();
552 for ( ; indexIter != indexEnd; ++indexIter ) {
557 // write data to stream
558 const FastaIndexData& data = (*indexIter);
559 indexBuffer << data.Name << "\t"
560 << data.Length << "\t"
561 << data.Offset << "\t"
562 << data.LineLength << "\t"
563 << data.ByteLength << endl;
565 // write stream to file
566 success &= ( fputs(indexBuffer.str().c_str(), IndexStream) >= 0 );
569 // return success status
573 // --------------------------------
574 // Fasta implementation
577 d = new FastaPrivate;
580 Fasta::~Fasta(void) {
585 bool Fasta::Close(void) {
589 bool Fasta::CreateIndex(const string& indexFilename) {
590 return d->CreateIndex(indexFilename);
593 bool Fasta::GetBase(const int& refId, const int& position, char& base) {
594 return d->GetBase(refId, position, base);
597 bool Fasta::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
598 return d->GetSequence(refId, start, stop, sequence);
601 bool Fasta::Open(const string& filename, const string& indexFilename) {
602 return d->Open(filename, indexFilename);