1 // ***************************************************************************
2 // bamtools_fasta.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 13 July 2010
7 // ---------------------------------------------------------------------------
8 // Provides FASTA reading/indexing functionality.
9 // ***************************************************************************
11 #include <utils/bamtools_fasta.h>
12 using namespace BamTools;
24 #define ftello _ftelli64
25 #define fseeko _fseeki64
28 struct Fasta::FastaPrivate {
30 struct FastaIndexData {
35 int32_t ByteLength; // LineLength + newline character(s) - varies on OS where file was generated
46 vector<FastaIndexData> Index;
52 // 'public' API methods
54 bool CreateIndex(const string& indexFilename);
55 bool GetBase(const int& refId, const int& position, char& base);
56 bool GetSequence(const int& refId, const int& start, const int& stop, string& sequence);
57 bool Open(const string& filename, const string& indexFilename);
61 void Chomp(char* sequence);
62 bool GetNameFromHeader(const string& header, string& name);
63 bool GetNextHeader(string& header);
64 bool GetNextSequence(string& sequence);
65 bool LoadIndexData(void);
67 bool WriteIndexData(void);
70 Fasta::FastaPrivate::FastaPrivate(void)
76 Fasta::FastaPrivate::~FastaPrivate(void) {
80 // remove any trailing newlines
81 void Fasta::FastaPrivate::Chomp(char* sequence) {
83 static const int CHAR_LF = 10;
84 static const int CHAR_CR = 13;
86 size_t seqLength = strlen(sequence);
87 if ( seqLength == 0 ) return;
88 --seqLength; // ignore null terminator
90 while ( sequence[seqLength] == CHAR_LF ||
91 sequence[seqLength] == CHAR_CR
94 sequence[seqLength] = 0;
101 bool Fasta::FastaPrivate::Close(void) {
110 if ( HasIndex && IsIndexOpen ) {
120 bool Fasta::FastaPrivate::CreateIndex(const string& indexFilename) {
122 // check that file is open
124 cerr << "FASTA error : cannot create index, FASTA file not open" << endl;
130 cerr << "FASTA error : could not rewind FASTA file" << endl;
134 // clear out prior index data
137 // -------------------------------------------
138 // calculate lineLength & byteLength
145 if ( fgets(buffer, 1024, Stream) == 0 ) {
146 cerr << "FASTA error : could not read from file" << endl;
149 if ( feof(Stream) ) return false;
150 if ( buffer[0] != '>' ) {
151 cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
155 // read in first line of sequence
156 char c = fgetc(Stream);
157 while ( (c >= 0) && (c != '\n') ) {
159 if (isgraph(c)) ++lineLength;
162 ++byteLength; // store newline
166 cerr << "FASTA error : could not rewind FASTA file" << endl;
170 // iterate through fasta entries
173 string sequence = "";
174 while ( GetNextHeader(header) ) {
176 // ---------------------------
177 // build index entry data
180 // store file offset of beginning of DNA sequence (after header)
181 data.Offset = ftello(Stream);
183 // parse header, store sequence name in data.Name
184 if ( !GetNameFromHeader(header, data.Name) ) {
185 cerr << "FASTA error : could not parse read name from FASTA header" << endl;
189 // retrieve FASTA sequence
190 if ( !GetNextSequence(sequence) ) {
191 cerr << "FASTA error : could not read in next sequence from FASTA file" << endl;
195 // store sequence length & line/byte lengths
196 data.Length = sequence.length();
197 data.LineLength = lineLength;
198 data.ByteLength = byteLength;
201 Index.push_back(data);
208 if ( !indexFilename.empty() ) {
209 IndexStream = fopen(indexFilename.c_str(), "wb");
210 if ( !IndexStream ) {
211 cerr << "FASTA error : Could not open " << indexFilename << " for writing." << endl;
218 if ( !WriteIndexData() ) return false;
225 // return succes status
229 bool Fasta::FastaPrivate::GetBase(const int& refId, const int& position, char& base) {
231 // make sure FASTA file is open
233 cerr << "FASTA error : file not open for reading" << endl;
237 // use index if available
238 if ( HasIndex && !Index.empty() ) {
240 // validate reference id
241 if ( (refId < 0) || (refId >= (int)Index.size()) ) {
242 cerr << "FASTA error: invalid refId specified: " << refId << endl;
246 // retrieve reference index data
247 const FastaIndexData& referenceData = Index.at(refId);
250 if ( (position < 0) || (position > referenceData.Length) ) {
251 cerr << "FASTA error: invalid position specified: " << position << endl;
255 // seek to beginning of sequence data
256 if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
257 cerr << "FASTA error : could not sek in file" << endl;
262 string sequence = "";
263 if ( !GetNextSequence(sequence) ) {
264 cerr << "FASTA error : could not retrieve base from FASTA file" << endl;
268 // set base & return success
269 base = sequence.at(position);
273 // else plow through sequentially
278 cerr << "FASTA error : could not rewind FASTA file" << endl;
282 // iterate through fasta entries
285 string sequence = "";
288 GetNextHeader(header);
289 GetNextSequence(sequence);
291 while ( currentId != refId ) {
292 GetNextHeader(header);
293 GetNextSequence(sequence);
297 // get desired base from sequence
298 // TODO: error reporting on invalid position
299 if ( currentId == refId && (sequence.length() >= (size_t)position) ) {
300 base = sequence.at(position);
304 // could not get sequence
312 bool Fasta::FastaPrivate::GetNameFromHeader(const string& header, string& name) {
314 // get rid of the leading greater than sign
315 string s = header.substr(1);
317 // extract the first non-whitespace segment
318 char* pName = (char*)s.data();
319 unsigned int nameLen = (unsigned int)s.size();
321 unsigned int start = 0;
322 while ( (pName[start] == 32) || (pName[start] == 9) || (pName[start] == 10) || (pName[start] == 13) ) {
324 if ( start == nameLen )
328 unsigned int stop = start;
329 if ( stop < nameLen ) {
330 while( (pName[stop] != 32) && (pName[stop] != 9) && (pName[stop] != 10) && (pName[stop] != 13) ) {
332 if ( stop == nameLen )
337 if ( start == stop ) {
338 cerr << "FASTA error : could not parse read name from FASTA header" << endl;
342 name = s.substr(start, stop - start).c_str();
346 bool Fasta::FastaPrivate::GetNextHeader(string& header) {
348 // validate input stream
349 if ( !IsOpen || feof(Stream) )
352 // read in header line
354 if ( fgets(buffer, 1024, Stream) == 0 ) {
355 cerr << "FASTA error : could not read from file" << endl;
359 // make sure it's a FASTA header
360 if ( buffer[0] != '>' ) {
361 cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
365 // import buffer contents to header string
366 stringstream headerBuffer("");
367 headerBuffer << buffer;
368 header = headerBuffer.str();
374 bool Fasta::FastaPrivate::GetNextSequence(string& sequence) {
376 // validate input stream
377 if ( !IsOpen || feof(Stream) )
382 ostringstream seqBuffer("");
385 char ch = fgetc(Stream);
387 if( (ch == '>') || feof(Stream) )
390 if ( fgets(buffer, 1024, Stream) == 0 ) {
391 cerr << "FASTA error : could not read from file" << endl;
399 // import buffer contents to sequence string
400 sequence = seqBuffer.str();
406 bool Fasta::FastaPrivate::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
408 // make sure FASTA file is open
410 cerr << "FASTA error : file not open for reading" << endl;
414 // use index if available
415 if ( HasIndex && !Index.empty() ) {
417 // validate reference id
418 if ( (refId < 0) || (refId >= (int)Index.size()) ) {
419 cerr << "FASTA error: invalid refId specified: " << refId << endl;
423 // retrieve reference index data
424 const FastaIndexData& referenceData = Index.at(refId);
426 // validate stop position
427 if ( (start < 0) || (start > stop) || (stop > referenceData.Length) ) {
428 cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " << stop << endl;
432 // seek to beginning of sequence data
433 if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
434 cerr << "FASTA error : could not sek in file" << endl;
438 // retrieve full sequence
439 string fullSequence = "";
440 if ( !GetNextSequence(fullSequence) ) {
441 cerr << "FASTA error : could not retrieve sequence from FASTA file" << endl;
445 // set sub-sequence & return success
446 const int seqLength = (stop - start) + 1;
447 sequence = fullSequence.substr(start, seqLength);
451 // else plow through sequentially
456 cerr << "FASTA error : could not rewind FASTA file" << endl;
460 // iterate through fasta entries
463 string fullSequence = "";
466 GetNextHeader(header);
467 GetNextSequence(fullSequence);
469 while ( currentId != refId ) {
470 GetNextHeader(header);
471 GetNextSequence(fullSequence);
475 // get desired substring from sequence
476 // TODO: error reporting on invalid start/stop positions
477 if ( currentId == refId && (fullSequence.length() >= (size_t)stop) ) {
478 const int seqLength = (stop - start) + 1;
479 sequence = fullSequence.substr(start, seqLength);
483 // could not get sequence
491 bool Fasta::FastaPrivate::LoadIndexData(void) {
493 // skip if no index file available
494 if ( !IsIndexOpen ) return false;
496 // clear any prior index data
500 stringstream indexBuffer;
503 char c = fgetc(IndexStream);
504 if ( (c == '\n') || feof(IndexStream) ) break;
505 ungetc(c, IndexStream);
507 // clear index buffer
510 // read line from index file
511 if ( fgets(buffer, 1024, IndexStream) == 0 ) {
512 cerr << "FASTA LoadIndexData() error : could not read from index file" << endl;
517 // store line in indexBuffer
518 indexBuffer << buffer;
520 // retrieve fasta index data from line
522 indexBuffer >> data.Name;
523 indexBuffer >> data.Length;
524 indexBuffer >> data.Offset;
525 indexBuffer >> data.LineLength;
526 indexBuffer >> data.ByteLength;
529 Index.push_back(data);
535 bool Fasta::FastaPrivate::Open(const string& filename, const string& indexFilename) {
539 // open FASTA filename
540 Stream = fopen(filename.c_str(), "rb");
542 cerr << "FASTA error: Could not open " << filename << " for reading" << endl;
548 // open index file if it exists
549 if ( !indexFilename.empty() ) {
550 IndexStream = fopen(indexFilename.c_str(), "rb");
551 if ( !IndexStream ) {
552 cerr << "FASTA error : Could not open " << indexFilename << " for reading." << endl;
556 success &= IsIndexOpen;
558 // attempt to load index data
559 HasIndex = LoadIndexData();
563 // return success status
567 bool Fasta::FastaPrivate::Rewind(void) {
568 if ( !IsOpen ) return false;
569 return ( fseeko(Stream, 0, SEEK_SET) == 0 );
572 bool Fasta::FastaPrivate::WriteIndexData(void) {
574 // skip if no index file available
575 if ( !IsIndexOpen ) return false;
577 // iterate over index entries
579 stringstream indexBuffer;
580 vector<FastaIndexData>::const_iterator indexIter = Index.begin();
581 vector<FastaIndexData>::const_iterator indexEnd = Index.end();
582 for ( ; indexIter != indexEnd; ++indexIter ) {
587 // write data to stream
588 const FastaIndexData& data = (*indexIter);
589 indexBuffer << data.Name << "\t"
590 << data.Length << "\t"
591 << data.Offset << "\t"
592 << data.LineLength << "\t"
593 << data.ByteLength << endl;
595 // write stream to file
596 success &= ( fputs(indexBuffer.str().c_str(), IndexStream) >= 0 );
599 // return success status
603 // --------------------------------
604 // Fasta implementation
607 d = new FastaPrivate;
610 Fasta::~Fasta(void) {
615 bool Fasta::Close(void) {
619 bool Fasta::CreateIndex(const string& indexFilename) {
620 return d->CreateIndex(indexFilename);
623 bool Fasta::GetBase(const int& refId, const int& position, char& base) {
624 return d->GetBase(refId, position, base);
627 bool Fasta::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
628 return d->GetSequence(refId, start, stop, sequence);
631 bool Fasta::Open(const string& filename, const string& indexFilename) {
632 return d->Open(filename, indexFilename);