1 // ***************************************************************************
2 // bamtools_fasta.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 10 October 2011
6 // ---------------------------------------------------------------------------
7 // Provides FASTA reading/indexing functionality.
8 // ***************************************************************************
10 #include "utils/bamtools_fasta.h"
11 using namespace BamTools;
23 #define ftello _ftelli64
24 #define fseeko _fseeki64
27 struct Fasta::FastaPrivate {
29 struct FastaIndexData {
34 int32_t ByteLength; // LineLength + newline character(s) - varies on OS where file was generated
45 vector<FastaIndexData> Index;
51 // 'public' API methods
53 bool CreateIndex(const string& indexFilename);
54 bool GetBase(const int& refId, const int& position, char& base);
55 bool GetSequence(const int& refId, const int& start, const int& stop, string& sequence);
56 bool Open(const string& filename, const string& indexFilename);
60 void Chomp(char* sequence);
61 bool GetNameFromHeader(const string& header, string& name);
62 bool GetNextHeader(string& header);
63 bool GetNextSequence(string& sequence);
64 bool LoadIndexData(void);
66 bool WriteIndexData(void);
69 Fasta::FastaPrivate::FastaPrivate(void)
75 Fasta::FastaPrivate::~FastaPrivate(void) {
79 // remove any trailing newlines
80 void Fasta::FastaPrivate::Chomp(char* sequence) {
82 static const int CHAR_LF = 10;
83 static const int CHAR_CR = 13;
85 size_t seqLength = strlen(sequence);
86 if ( seqLength == 0 ) return;
87 --seqLength; // ignore null terminator
89 while ( sequence[seqLength] == CHAR_LF ||
90 sequence[seqLength] == CHAR_CR
93 sequence[seqLength] = 0;
100 bool Fasta::FastaPrivate::Close(void) {
109 if ( HasIndex && IsIndexOpen ) {
119 bool Fasta::FastaPrivate::CreateIndex(const string& indexFilename) {
121 // check that file is open
123 cerr << "FASTA error : cannot create index, FASTA file not open" << endl;
129 cerr << "FASTA error : could not rewind FASTA file" << endl;
133 // clear out prior index data
136 // -------------------------------------------
137 // calculate lineLength & byteLength
144 if ( fgets(buffer, 1024, Stream) == 0 ) {
145 cerr << "FASTA error : could not read from file" << endl;
148 if ( feof(Stream) ) return false;
149 if ( buffer[0] != '>' ) {
150 cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
154 // read in first line of sequence
155 char c = fgetc(Stream);
156 while ( (c >= 0) && (c != '\n') ) {
158 if (isgraph(c)) ++lineLength;
161 ++byteLength; // store newline
165 cerr << "FASTA error : could not rewind FASTA file" << endl;
169 // iterate through fasta entries
172 string sequence = "";
173 while ( GetNextHeader(header) ) {
175 // ---------------------------
176 // build index entry data
179 // store file offset of beginning of DNA sequence (after header)
180 data.Offset = ftello(Stream);
182 // parse header, store sequence name in data.Name
183 if ( !GetNameFromHeader(header, data.Name) ) {
184 cerr << "FASTA error : could not parse read name from FASTA header" << endl;
188 // retrieve FASTA sequence
189 if ( !GetNextSequence(sequence) ) {
190 cerr << "FASTA error : could not read in next sequence from FASTA file" << endl;
194 // store sequence length & line/byte lengths
195 data.Length = sequence.length();
196 data.LineLength = lineLength;
197 data.ByteLength = byteLength;
200 Index.push_back(data);
207 if ( !indexFilename.empty() ) {
208 IndexStream = fopen(indexFilename.c_str(), "wb");
209 if ( !IndexStream ) {
210 cerr << "FASTA error : Could not open " << indexFilename << " for writing." << endl;
217 if ( !WriteIndexData() ) return false;
224 // return succes status
228 bool Fasta::FastaPrivate::GetBase(const int& refId, const int& position, char& base) {
230 // make sure FASTA file is open
232 cerr << "FASTA error : file not open for reading" << endl;
236 // use index if available
237 if ( HasIndex && !Index.empty() ) {
239 // validate reference id
240 if ( (refId < 0) || (refId >= (int)Index.size()) ) {
241 cerr << "FASTA error: invalid refId specified: " << refId << endl;
245 // retrieve reference index data
246 const FastaIndexData& referenceData = Index.at(refId);
249 if ( (position < 0) || (position > referenceData.Length) ) {
250 cerr << "FASTA error: invalid position specified: " << position << endl;
254 // seek to beginning of sequence data
255 if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
256 cerr << "FASTA error : could not sek in file" << endl;
261 string sequence = "";
262 if ( !GetNextSequence(sequence) ) {
263 cerr << "FASTA error : could not retrieve base from FASTA file" << endl;
267 // set base & return success
268 base = sequence.at(position);
272 // else plow through sequentially
277 cerr << "FASTA error : could not rewind FASTA file" << endl;
281 // iterate through fasta entries
284 string sequence = "";
287 GetNextHeader(header);
288 GetNextSequence(sequence);
290 while ( currentId != refId ) {
291 GetNextHeader(header);
292 GetNextSequence(sequence);
296 // get desired base from sequence
297 // TODO: error reporting on invalid position
298 if ( currentId == refId && (sequence.length() >= (size_t)position) ) {
299 base = sequence.at(position);
303 // could not get sequence
311 bool Fasta::FastaPrivate::GetNameFromHeader(const string& header, string& name) {
313 // get rid of the leading greater than sign
314 string s = header.substr(1);
316 // extract the first non-whitespace segment
317 char* pName = (char*)s.data();
318 unsigned int nameLen = (unsigned int)s.size();
320 unsigned int start = 0;
321 while ( (pName[start] == 32) || (pName[start] == 9) || (pName[start] == 10) || (pName[start] == 13) ) {
323 if ( start == nameLen )
327 unsigned int stop = start;
328 if ( stop < nameLen ) {
329 while( (pName[stop] != 32) && (pName[stop] != 9) && (pName[stop] != 10) && (pName[stop] != 13) ) {
331 if ( stop == nameLen )
336 if ( start == stop ) {
337 cerr << "FASTA error : could not parse read name from FASTA header" << endl;
341 name = s.substr(start, stop - start).c_str();
345 bool Fasta::FastaPrivate::GetNextHeader(string& header) {
347 // validate input stream
348 if ( !IsOpen || feof(Stream) )
351 // read in header line
353 if ( fgets(buffer, 1024, Stream) == 0 ) {
354 cerr << "FASTA error : could not read from file" << endl;
358 // make sure it's a FASTA header
359 if ( buffer[0] != '>' ) {
360 cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
364 // import buffer contents to header string
365 stringstream headerBuffer("");
366 headerBuffer << buffer;
367 header = headerBuffer.str();
373 bool Fasta::FastaPrivate::GetNextSequence(string& sequence) {
375 // validate input stream
376 if ( !IsOpen || feof(Stream) )
381 ostringstream seqBuffer("");
384 char ch = fgetc(Stream);
386 if( (ch == '>') || feof(Stream) )
389 if ( fgets(buffer, 1024, Stream) == 0 ) {
390 cerr << "FASTA error : could not read from file" << endl;
398 // import buffer contents to sequence string
399 sequence = seqBuffer.str();
405 bool Fasta::FastaPrivate::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
407 // make sure FASTA file is open
409 cerr << "FASTA error : file not open for reading" << endl;
413 // use index if available
414 if ( HasIndex && !Index.empty() ) {
416 // validate reference id
417 if ( (refId < 0) || (refId >= (int)Index.size()) ) {
418 cerr << "FASTA error: invalid refId specified: " << refId << endl;
422 // retrieve reference index data
423 const FastaIndexData& referenceData = Index.at(refId);
425 // validate stop position
426 if ( (start < 0) || (start > stop) || (stop > referenceData.Length) ) {
427 cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " << stop << endl;
431 // seek to beginning of sequence data
432 if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
433 cerr << "FASTA error : could not sek in file" << endl;
437 // retrieve full sequence
438 string fullSequence = "";
439 if ( !GetNextSequence(fullSequence) ) {
440 cerr << "FASTA error : could not retrieve sequence from FASTA file" << endl;
444 // set sub-sequence & return success
445 const int seqLength = (stop - start) + 1;
446 sequence = fullSequence.substr(start, seqLength);
450 // else plow through sequentially
455 cerr << "FASTA error : could not rewind FASTA file" << endl;
459 // iterate through fasta entries
462 string fullSequence = "";
465 GetNextHeader(header);
466 GetNextSequence(fullSequence);
468 while ( currentId != refId ) {
469 GetNextHeader(header);
470 GetNextSequence(fullSequence);
474 // get desired substring from sequence
475 // TODO: error reporting on invalid start/stop positions
476 if ( currentId == refId && (fullSequence.length() >= (size_t)stop) ) {
477 const int seqLength = (stop - start) + 1;
478 sequence = fullSequence.substr(start, seqLength);
482 // could not get sequence
490 bool Fasta::FastaPrivate::LoadIndexData(void) {
492 // skip if no index file available
493 if ( !IsIndexOpen ) return false;
495 // clear any prior index data
499 stringstream indexBuffer;
502 char c = fgetc(IndexStream);
503 if ( (c == '\n') || feof(IndexStream) ) break;
504 ungetc(c, IndexStream);
506 // clear index buffer
509 // read line from index file
510 if ( fgets(buffer, 1024, IndexStream) == 0 ) {
511 cerr << "FASTA LoadIndexData() error : could not read from index file" << endl;
516 // store line in indexBuffer
517 indexBuffer << buffer;
519 // retrieve fasta index data from line
521 indexBuffer >> data.Name;
522 indexBuffer >> data.Length;
523 indexBuffer >> data.Offset;
524 indexBuffer >> data.LineLength;
525 indexBuffer >> data.ByteLength;
528 Index.push_back(data);
534 bool Fasta::FastaPrivate::Open(const string& filename, const string& indexFilename) {
538 // open FASTA filename
539 Stream = fopen(filename.c_str(), "rb");
541 cerr << "FASTA error: Could not open " << filename << " for reading" << endl;
547 // open index file if it exists
548 if ( !indexFilename.empty() ) {
549 IndexStream = fopen(indexFilename.c_str(), "rb");
550 if ( !IndexStream ) {
551 cerr << "FASTA error : Could not open " << indexFilename << " for reading." << endl;
555 success &= IsIndexOpen;
557 // attempt to load index data
558 HasIndex = LoadIndexData();
562 // return success status
566 bool Fasta::FastaPrivate::Rewind(void) {
567 if ( !IsOpen ) return false;
568 return ( fseeko(Stream, 0, SEEK_SET) == 0 );
571 bool Fasta::FastaPrivate::WriteIndexData(void) {
573 // skip if no index file available
574 if ( !IsIndexOpen ) return false;
576 // iterate over index entries
578 stringstream indexBuffer;
579 vector<FastaIndexData>::const_iterator indexIter = Index.begin();
580 vector<FastaIndexData>::const_iterator indexEnd = Index.end();
581 for ( ; indexIter != indexEnd; ++indexIter ) {
586 // write data to stream
587 const FastaIndexData& data = (*indexIter);
588 indexBuffer << data.Name << "\t"
589 << data.Length << "\t"
590 << data.Offset << "\t"
591 << data.LineLength << "\t"
592 << data.ByteLength << endl;
594 // write stream to file
595 success &= ( fputs(indexBuffer.str().c_str(), IndexStream) >= 0 );
598 // return success status
602 // --------------------------------
603 // Fasta implementation
606 d = new FastaPrivate;
609 Fasta::~Fasta(void) {
614 bool Fasta::Close(void) {
618 bool Fasta::CreateIndex(const string& indexFilename) {
619 return d->CreateIndex(indexFilename);
622 bool Fasta::GetBase(const int& refId, const int& position, char& base) {
623 return d->GetBase(refId, position, base);
626 bool Fasta::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
627 return d->GetSequence(refId, start, stop, sequence);
630 bool Fasta::Open(const string& filename, const string& indexFilename) {
631 return d->Open(filename, indexFilename);