1 // ***************************************************************************
2 // bamtools_fasta.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 13 July 2010
7 // ---------------------------------------------------------------------------
8 // Provides FASTA reading/indexing functionality.
9 // ***************************************************************************
18 #include "bamtools_fasta.h"
20 using namespace BamTools;
22 struct Fasta::FastaPrivate {
24 struct FastaIndexData {
29 int32_t ByteLength; // LineLength + newline character(s) - varies on OS where file was generated
40 vector<FastaIndexData> Index;
46 // 'public' API methods
48 bool CreateIndex(const string& indexFilename);
49 bool GetBase(const int& refId, const int& position, char& base);
50 bool GetSequence(const int& refId, const int& start, const int& stop, string& sequence);
51 bool Open(const string& filename, const string& indexFilename);
55 void Chomp(char* sequence);
56 bool GetNameFromHeader(const string& header, string& name);
57 bool GetNextHeader(string& header);
58 bool GetNextSequence(string& sequence);
59 bool LoadIndexData(void);
61 bool WriteIndexData(void);
64 Fasta::FastaPrivate::FastaPrivate(void)
70 Fasta::FastaPrivate::~FastaPrivate(void) {
74 // remove any trailing newlines
75 void Fasta::FastaPrivate::Chomp(char* sequence) {
77 static const int CHAR_LF = 10;
78 static const int CHAR_CR = 13;
80 size_t seqLength = strlen(sequence);
81 if ( seqLength == 0 ) return;
82 --seqLength; // ignore null terminator
84 while ( sequence[seqLength] == CHAR_LF ||
85 sequence[seqLength] == CHAR_CR
88 sequence[seqLength] = 0;
95 bool Fasta::FastaPrivate::Close(void) {
104 if ( HasIndex && IsIndexOpen ) {
114 bool Fasta::FastaPrivate::CreateIndex(const string& indexFilename) {
116 // check that file is open
118 cerr << "FASTA error : cannot create index, FASTA file not open" << endl;
124 cerr << "FASTA error : could not rewind FASTA file" << endl;
128 // clear out prior index data
131 // -------------------------------------------
132 // calculate lineLength & byteLength
139 if ( fgets(buffer, 1024, Stream) == 0 ) {
140 cerr << "FASTA error : could not read from file" << endl;
143 if ( feof(Stream) ) return false;
144 if ( buffer[0] != '>' ) {
145 cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
149 // read in first line of sequence
150 char c = fgetc(Stream);
151 while ( (c >= 0) && (c != '\n') ) {
153 if (isgraph(c)) ++lineLength;
156 ++byteLength; // store newline
160 cerr << "FASTA error : could not rewind FASTA file" << endl;
164 // iterate through fasta entries
167 string sequence = "";
168 while ( GetNextHeader(header) ) {
170 // ---------------------------
171 // build index entry data
174 // store file offset of beginning of DNA sequence (after header)
175 data.Offset = ftello(Stream);
177 // parse header, store sequence name in data.Name
178 if ( !GetNameFromHeader(header, data.Name) ) {
179 cerr << "FASTA error : could not parse read name from FASTA header" << endl;
183 // retrieve FASTA sequence
184 if ( !GetNextSequence(sequence) ) {
185 cerr << "FASTA error : could not read in next sequence from FASTA file" << endl;
189 // store sequence length & line/byte lengths
190 data.Length = sequence.length();
191 data.LineLength = lineLength;
192 data.ByteLength = byteLength;
195 Index.push_back(data);
202 if ( !indexFilename.empty() ) {
203 IndexStream = fopen(indexFilename.c_str(), "wb");
204 if ( !IndexStream ) {
205 cerr << "FASTA error : Could not open " << indexFilename << " for writing." << endl;
212 if ( !WriteIndexData() ) return false;
219 // return succes status
223 bool Fasta::FastaPrivate::GetBase(const int& refId, const int& position, char& base) {
225 // make sure FASTA file is open
227 cerr << "FASTA error : file not open for reading" << endl;
231 // use index if available
232 if ( HasIndex && !Index.empty() ) {
234 // validate reference id
235 if ( (refId < 0) || (refId >= (int)Index.size()) ) {
236 cerr << "FASTA error: invalid refId specified: " << refId << endl;
240 // retrieve reference index data
241 const FastaIndexData& referenceData = Index.at(refId);
244 if ( (position < 0) || (position > referenceData.Length) ) {
245 cerr << "FASTA error: invalid position specified: " << position << endl;
249 // seek to beginning of sequence data
250 if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
251 cerr << "FASTA error : could not sek in file" << endl;
256 string sequence = "";
257 if ( !GetNextSequence(sequence) ) {
258 cerr << "FASTA error : could not retrieve base from FASTA file" << endl;
262 // set base & return success
263 base = sequence.at(position);
267 // else plow through sequentially
272 cerr << "FASTA error : could not rewind FASTA file" << endl;
276 // iterate through fasta entries
279 string sequence = "";
282 GetNextHeader(header);
283 GetNextSequence(sequence);
285 while ( currentId != refId ) {
286 GetNextHeader(header);
287 GetNextSequence(sequence);
291 // get desired base from sequence
292 // TODO: error reporting on invalid position
293 if ( currentId == refId && (sequence.length() >= (size_t)position) ) {
294 base = sequence.at(position);
298 // could not get sequence
306 bool Fasta::FastaPrivate::GetNameFromHeader(const string& header, string& name) {
308 // get rid of the leading greater than sign
309 string s = header.substr(1);
311 // extract the first non-whitespace segment
312 char* pName = (char*)s.data();
313 unsigned int nameLen = (unsigned int)s.size();
315 unsigned int start = 0;
316 while ( (pName[start] == 32) || (pName[start] == 9) || (pName[start] == 10) || (pName[start] == 13) ) {
318 if ( start == nameLen )
322 unsigned int stop = start;
323 if ( stop < nameLen ) {
324 while( (pName[stop] != 32) && (pName[stop] != 9) && (pName[stop] != 10) && (pName[stop] != 13) ) {
326 if ( stop == nameLen )
331 if ( start == stop ) {
332 cerr << "FASTA error : could not parse read name from FASTA header" << endl;
336 name = s.substr(start, stop - start).c_str();
340 bool Fasta::FastaPrivate::GetNextHeader(string& header) {
342 // validate input stream
343 if ( !IsOpen || feof(Stream) )
346 // read in header line
348 if ( fgets(buffer, 1024, Stream) == 0 ) {
349 cerr << "FASTA error : could not read from file" << endl;
353 // make sure it's a FASTA header
354 if ( buffer[0] != '>' ) {
355 cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
359 // import buffer contents to header string
360 stringstream headerBuffer("");
361 headerBuffer << buffer;
362 header = headerBuffer.str();
368 bool Fasta::FastaPrivate::GetNextSequence(string& sequence) {
370 // validate input stream
371 if ( !IsOpen || feof(Stream) )
376 ostringstream seqBuffer("");
379 char ch = fgetc(Stream);
381 if( (ch == '>') || feof(Stream) )
384 if ( fgets(buffer, 1024, Stream) == 0 ) {
385 cerr << "FASTA error : could not read from file" << endl;
393 // import buffer contents to sequence string
394 sequence = seqBuffer.str();
400 bool Fasta::FastaPrivate::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
402 // make sure FASTA file is open
404 cerr << "FASTA error : file not open for reading" << endl;
408 // use index if available
409 if ( HasIndex && !Index.empty() ) {
411 // validate reference id
412 if ( (refId < 0) || (refId >= (int)Index.size()) ) {
413 cerr << "FASTA error: invalid refId specified: " << refId << endl;
417 // retrieve reference index data
418 const FastaIndexData& referenceData = Index.at(refId);
420 // validate stop position
421 if ( (start < 0) || (start > stop) || (stop > referenceData.Length) ) {
422 cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " << stop << endl;
426 // seek to beginning of sequence data
427 if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
428 cerr << "FASTA error : could not sek in file" << endl;
432 // retrieve full sequence
433 string fullSequence = "";
434 if ( !GetNextSequence(fullSequence) ) {
435 cerr << "FASTA error : could not retrieve sequence from FASTA file" << endl;
439 // set sub-sequence & return success
440 const int seqLength = (stop - start) + 1;
441 sequence = fullSequence.substr(start, seqLength);
445 // else plow through sequentially
450 cerr << "FASTA error : could not rewind FASTA file" << endl;
454 // iterate through fasta entries
457 string fullSequence = "";
460 GetNextHeader(header);
461 GetNextSequence(fullSequence);
463 while ( currentId != refId ) {
464 GetNextHeader(header);
465 GetNextSequence(fullSequence);
469 // get desired substring from sequence
470 // TODO: error reporting on invalid start/stop positions
471 if ( currentId == refId && (fullSequence.length() >= (size_t)stop) ) {
472 const int seqLength = (stop - start) + 1;
473 sequence = fullSequence.substr(start, seqLength);
477 // could not get sequence
485 bool Fasta::FastaPrivate::LoadIndexData(void) {
487 // skip if no index file available
488 if ( !IsIndexOpen ) return false;
490 // clear any prior index data
494 stringstream indexBuffer;
497 char c = fgetc(IndexStream);
498 if ( (c == '\n') || feof(IndexStream) ) break;
499 ungetc(c, IndexStream);
501 // clear index buffer
504 // read line from index file
505 if ( fgets(buffer, 1024, IndexStream) == 0 ) {
506 cerr << "FASTA LoadIndexData() error : could not read from index file" << endl;
511 // store line in indexBuffer
512 indexBuffer << buffer;
514 // retrieve fasta index data from line
516 indexBuffer >> data.Name;
517 indexBuffer >> data.Length;
518 indexBuffer >> data.Offset;
519 indexBuffer >> data.LineLength;
520 indexBuffer >> data.ByteLength;
523 Index.push_back(data);
529 bool Fasta::FastaPrivate::Open(const string& filename, const string& indexFilename) {
533 // open FASTA filename
534 Stream = fopen(filename.c_str(), "rb");
536 cerr << "FASTA error: Could not open " << filename << " for reading" << endl;
542 // open index file if it exists
543 if ( !indexFilename.empty() ) {
544 IndexStream = fopen(indexFilename.c_str(), "rb");
545 if ( !IndexStream ) {
546 cerr << "FASTA error : Could not open " << indexFilename << " for reading." << endl;
550 success &= IsIndexOpen;
552 // attempt to load index data
553 HasIndex = LoadIndexData();
557 // return success status
561 bool Fasta::FastaPrivate::Rewind(void) {
562 if ( !IsOpen ) return false;
563 return ( fseeko(Stream, 0, SEEK_SET) == 0 );
566 bool Fasta::FastaPrivate::WriteIndexData(void) {
568 // skip if no index file available
569 if ( !IsIndexOpen ) return false;
571 // iterate over index entries
573 stringstream indexBuffer;
574 vector<FastaIndexData>::const_iterator indexIter = Index.begin();
575 vector<FastaIndexData>::const_iterator indexEnd = Index.end();
576 for ( ; indexIter != indexEnd; ++indexIter ) {
581 // write data to stream
582 const FastaIndexData& data = (*indexIter);
583 indexBuffer << data.Name << "\t"
584 << data.Length << "\t"
585 << data.Offset << "\t"
586 << data.LineLength << "\t"
587 << data.ByteLength << endl;
589 // write stream to file
590 success &= ( fputs(indexBuffer.str().c_str(), IndexStream) >= 0 );
593 // return success status
597 // --------------------------------
598 // Fasta implementation
601 d = new FastaPrivate;
604 Fasta::~Fasta(void) {
609 bool Fasta::Close(void) {
613 bool Fasta::CreateIndex(const string& indexFilename) {
614 return d->CreateIndex(indexFilename);
617 bool Fasta::GetBase(const int& refId, const int& position, char& base) {
618 return d->GetBase(refId, position, base);
621 bool Fasta::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
622 return d->GetSequence(refId, start, stop, sequence);
625 bool Fasta::Open(const string& filename, const string& indexFilename) {
626 return d->Open(filename, indexFilename);