1 // ***************************************************************************
2 // bamtools_fasta.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 9 March 2012 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides FASTA reading/indexing functionality.
8 // ***************************************************************************
10 #include "utils/bamtools_fasta.h"
11 using namespace BamTools;
22 struct Fasta::FastaPrivate {
24 struct FastaIndexData {
29 int32_t ByteLength; // LineLength + newline character(s) - varies on OS where file was generated
40 vector<FastaIndexData> Index;
46 // 'public' API methods
48 bool CreateIndex(const string& indexFilename);
49 bool GetBase(const int& refId, const int& position, char& base);
50 bool GetSequence(const int& refId, const int& start, const int& stop, string& sequence);
51 bool Open(const string& filename, const string& indexFilename);
55 void Chomp(char* sequence);
56 bool GetNameFromHeader(const string& header, string& name);
57 bool GetNextHeader(string& header);
58 bool GetNextSequence(string& sequence);
59 bool LoadIndexData(void);
61 bool WriteIndexData(void);
64 Fasta::FastaPrivate::FastaPrivate(void)
70 Fasta::FastaPrivate::~FastaPrivate(void) {
74 // remove any trailing newlines
75 void Fasta::FastaPrivate::Chomp(char* sequence) {
77 static const int CHAR_LF = 10;
78 static const int CHAR_CR = 13;
80 size_t seqLength = strlen(sequence);
81 if ( seqLength == 0 ) return;
82 --seqLength; // ignore null terminator
84 while ( sequence[seqLength] == CHAR_LF ||
85 sequence[seqLength] == CHAR_CR
88 sequence[seqLength] = 0;
95 bool Fasta::FastaPrivate::Close(void) {
104 if ( HasIndex && IsIndexOpen ) {
114 bool Fasta::FastaPrivate::CreateIndex(const string& indexFilename) {
116 // check that file is open
118 cerr << "FASTA error : cannot create index, FASTA file not open" << endl;
124 cerr << "FASTA error : could not rewind FASTA file" << endl;
128 // clear out prior index data
131 // -------------------------------------------
132 // calculate lineLength & byteLength
139 if ( fgets(buffer, 1024, Stream) == 0 ) {
140 cerr << "FASTA error : could not read from file" << endl;
143 if ( feof(Stream) ) return false;
144 if ( buffer[0] != '>' ) {
145 cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
149 // read in first line of sequence
150 char c = fgetc(Stream);
151 while ( (c >= 0) && (c != '\n') ) {
153 if (isgraph(c)) ++lineLength;
156 ++byteLength; // store newline
160 cerr << "FASTA error : could not rewind FASTA file" << endl;
164 // iterate through fasta entries
167 string sequence = "";
168 while ( GetNextHeader(header) ) {
170 // ---------------------------
171 // build index entry data
174 // store file offset of beginning of DNA sequence (after header)
175 data.Offset = ftello(Stream);
177 // parse header, store sequence name in data.Name
178 if ( !GetNameFromHeader(header, data.Name) ) {
179 cerr << "FASTA error : could not parse read name from FASTA header" << endl;
183 // retrieve FASTA sequence
184 if ( !GetNextSequence(sequence) ) {
185 cerr << "FASTA error : could not read in next sequence from FASTA file" << endl;
189 // store sequence length & line/byte lengths
190 data.Length = sequence.length();
191 data.LineLength = lineLength;
192 data.ByteLength = byteLength;
195 Index.push_back(data);
202 if ( !indexFilename.empty() ) {
203 IndexStream = fopen(indexFilename.c_str(), "wb");
204 if ( !IndexStream ) {
205 cerr << "FASTA error : Could not open " << indexFilename << " for writing." << endl;
212 if ( !WriteIndexData() ) return false;
219 // return succes status
223 bool Fasta::FastaPrivate::GetBase(const int& refId, const int& position, char& base) {
225 // make sure FASTA file is open
227 cerr << "FASTA error : file not open for reading" << endl;
231 // use index if available
232 if ( HasIndex && !Index.empty() ) {
234 // validate reference id
235 if ( (refId < 0) || (refId >= (int)Index.size()) ) {
236 cerr << "FASTA error: invalid refId specified: " << refId << endl;
240 // retrieve reference index data
241 const FastaIndexData& referenceData = Index.at(refId);
244 if ( (position < 0) || (position > referenceData.Length) ) {
245 cerr << "FASTA error: invalid position specified: " << position << endl;
249 // calculate seek position & attempt jump
250 const int64_t lines = position / referenceData.LineLength;
251 const int64_t lineOffset = position % referenceData.LineLength;
252 const int64_t seekTo = referenceData.Offset + (lines*referenceData.ByteLength) + lineOffset;
253 if ( fseek64(Stream, seekTo, SEEK_SET) != 0 ) {
254 cerr << "FASTA error : could not seek in file" << endl;
258 // set base & return success
263 // else plow through sequentially
268 cerr << "FASTA error : could not rewind FASTA file" << endl;
272 // iterate through fasta entries
275 string sequence = "";
278 GetNextHeader(header);
279 GetNextSequence(sequence);
281 while ( currentId != refId ) {
282 GetNextHeader(header);
283 GetNextSequence(sequence);
287 // get desired base from sequence
288 // TODO: error reporting on invalid position
289 if ( currentId == refId && (sequence.length() >= (size_t)position) ) {
290 base = sequence.at(position);
294 // could not get sequence
302 bool Fasta::FastaPrivate::GetNameFromHeader(const string& header, string& name) {
304 // get rid of the leading greater than sign
305 string s = header.substr(1);
307 // extract the first non-whitespace segment
308 char* pName = (char*)s.data();
309 unsigned int nameLen = (unsigned int)s.size();
311 unsigned int start = 0;
312 while ( (pName[start] == 32) || (pName[start] == 9) || (pName[start] == 10) || (pName[start] == 13) ) {
314 if ( start == nameLen )
318 unsigned int stop = start;
319 if ( stop < nameLen ) {
320 while( (pName[stop] != 32) && (pName[stop] != 9) && (pName[stop] != 10) && (pName[stop] != 13) ) {
322 if ( stop == nameLen )
327 if ( start == stop ) {
328 cerr << "FASTA error : could not parse read name from FASTA header" << endl;
332 name = s.substr(start, stop - start).c_str();
336 bool Fasta::FastaPrivate::GetNextHeader(string& header) {
338 // validate input stream
339 if ( !IsOpen || feof(Stream) )
342 // read in header line
344 if ( fgets(buffer, 1024, Stream) == 0 ) {
345 cerr << "FASTA error : could not read from file" << endl;
349 // make sure it's a FASTA header
350 if ( buffer[0] != '>' ) {
351 cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
355 // import buffer contents to header string
356 stringstream headerBuffer("");
357 headerBuffer << buffer;
358 header = headerBuffer.str();
364 bool Fasta::FastaPrivate::GetNextSequence(string& sequence) {
366 // validate input stream
367 if ( !IsOpen || feof(Stream) )
372 ostringstream seqBuffer("");
375 char ch = fgetc(Stream);
377 if( (ch == '>') || feof(Stream) )
380 if ( fgets(buffer, 1024, Stream) == 0 ) {
381 cerr << "FASTA error : could not read from file" << endl;
389 // import buffer contents to sequence string
390 sequence = seqBuffer.str();
396 bool Fasta::FastaPrivate::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
398 // make sure FASTA file is open
400 cerr << "FASTA error : file not open for reading" << endl;
404 // use index if available
405 if ( HasIndex && !Index.empty() ) {
407 // validate reference id
408 if ( (refId < 0) || (refId >= (int)Index.size()) ) {
409 cerr << "FASTA error: invalid refId specified: " << refId << endl;
413 // retrieve reference index data
414 const FastaIndexData& referenceData = Index.at(refId);
416 // validate stop position
417 if ( (start < 0) || (start > stop) || (stop > referenceData.Length) ) {
418 cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " << stop << endl;
422 // seek to beginning of sequence data
423 if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
424 cerr << "FASTA error : could not sek in file" << endl;
428 // retrieve full sequence
429 string fullSequence = "";
430 if ( !GetNextSequence(fullSequence) ) {
431 cerr << "FASTA error : could not retrieve sequence from FASTA file" << endl;
435 // set sub-sequence & return success
436 const int seqLength = (stop - start) + 1;
437 sequence = fullSequence.substr(start, seqLength);
441 // else plow through sequentially
446 cerr << "FASTA error : could not rewind FASTA file" << endl;
450 // iterate through fasta entries
453 string fullSequence = "";
456 GetNextHeader(header);
457 GetNextSequence(fullSequence);
459 while ( currentId != refId ) {
460 GetNextHeader(header);
461 GetNextSequence(fullSequence);
465 // get desired substring from sequence
466 // TODO: error reporting on invalid start/stop positions
467 if ( currentId == refId && (fullSequence.length() >= (size_t)stop) ) {
468 const int seqLength = (stop - start) + 1;
469 sequence = fullSequence.substr(start, seqLength);
473 // could not get sequence
481 bool Fasta::FastaPrivate::LoadIndexData(void) {
483 // skip if no index file available
484 if ( !IsIndexOpen ) return false;
486 // clear any prior index data
490 stringstream indexBuffer;
493 char c = fgetc(IndexStream);
494 if ( (c == '\n') || feof(IndexStream) ) break;
495 ungetc(c, IndexStream);
497 // clear index buffer
500 // read line from index file
501 if ( fgets(buffer, 1024, IndexStream) == 0 ) {
502 cerr << "FASTA LoadIndexData() error : could not read from index file" << endl;
507 // store line in indexBuffer
508 indexBuffer << buffer;
510 // retrieve fasta index data from line
512 indexBuffer >> data.Name;
513 indexBuffer >> data.Length;
514 indexBuffer >> data.Offset;
515 indexBuffer >> data.LineLength;
516 indexBuffer >> data.ByteLength;
519 Index.push_back(data);
525 bool Fasta::FastaPrivate::Open(const string& filename, const string& indexFilename) {
529 // open FASTA filename
530 Stream = fopen(filename.c_str(), "rb");
532 cerr << "FASTA error: Could not open " << filename << " for reading" << endl;
538 // open index file if it exists
539 if ( !indexFilename.empty() ) {
540 IndexStream = fopen(indexFilename.c_str(), "rb");
541 if ( !IndexStream ) {
542 cerr << "FASTA error : Could not open " << indexFilename << " for reading." << endl;
546 success &= IsIndexOpen;
548 // attempt to load index data
549 HasIndex = LoadIndexData();
553 // return success status
557 bool Fasta::FastaPrivate::Rewind(void) {
558 if ( !IsOpen ) return false;
559 return ( fseeko(Stream, 0, SEEK_SET) == 0 );
562 bool Fasta::FastaPrivate::WriteIndexData(void) {
564 // skip if no index file available
565 if ( !IsIndexOpen ) return false;
567 // iterate over index entries
569 stringstream indexBuffer;
570 vector<FastaIndexData>::const_iterator indexIter = Index.begin();
571 vector<FastaIndexData>::const_iterator indexEnd = Index.end();
572 for ( ; indexIter != indexEnd; ++indexIter ) {
577 // write data to stream
578 const FastaIndexData& data = (*indexIter);
579 indexBuffer << data.Name << "\t"
580 << data.Length << "\t"
581 << data.Offset << "\t"
582 << data.LineLength << "\t"
583 << data.ByteLength << endl;
585 // write stream to file
586 success &= ( fputs(indexBuffer.str().c_str(), IndexStream) >= 0 );
589 // return success status
593 // --------------------------------
594 // Fasta implementation
597 d = new FastaPrivate;
600 Fasta::~Fasta(void) {
605 bool Fasta::Close(void) {
609 bool Fasta::CreateIndex(const string& indexFilename) {
610 return d->CreateIndex(indexFilename);
613 bool Fasta::GetBase(const int& refId, const int& position, char& base) {
614 return d->GetBase(refId, position, base);
617 bool Fasta::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
618 return d->GetSequence(refId, start, stop, sequence);
621 bool Fasta::Open(const string& filename, const string& indexFilename) {
622 return d->Open(filename, indexFilename);