1 // ***************************************************************************
2 // bamtools_fasta.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 13 July 2010
7 // ---------------------------------------------------------------------------
8 // Provides FASTA reading/indexing functionality.
9 // ***************************************************************************
11 #include <utils/bamtools_fasta.h>
12 using namespace BamTools;
23 struct Fasta::FastaPrivate {
25 struct FastaIndexData {
30 int32_t ByteLength; // LineLength + newline character(s) - varies on OS where file was generated
41 vector<FastaIndexData> Index;
47 // 'public' API methods
49 bool CreateIndex(const string& indexFilename);
50 bool GetBase(const int& refId, const int& position, char& base);
51 bool GetSequence(const int& refId, const int& start, const int& stop, string& sequence);
52 bool Open(const string& filename, const string& indexFilename);
56 void Chomp(char* sequence);
57 bool GetNameFromHeader(const string& header, string& name);
58 bool GetNextHeader(string& header);
59 bool GetNextSequence(string& sequence);
60 bool LoadIndexData(void);
62 bool WriteIndexData(void);
65 Fasta::FastaPrivate::FastaPrivate(void)
71 Fasta::FastaPrivate::~FastaPrivate(void) {
75 // remove any trailing newlines
76 void Fasta::FastaPrivate::Chomp(char* sequence) {
78 static const int CHAR_LF = 10;
79 static const int CHAR_CR = 13;
81 size_t seqLength = strlen(sequence);
82 if ( seqLength == 0 ) return;
83 --seqLength; // ignore null terminator
85 while ( sequence[seqLength] == CHAR_LF ||
86 sequence[seqLength] == CHAR_CR
89 sequence[seqLength] = 0;
96 bool Fasta::FastaPrivate::Close(void) {
105 if ( HasIndex && IsIndexOpen ) {
115 bool Fasta::FastaPrivate::CreateIndex(const string& indexFilename) {
117 // check that file is open
119 cerr << "FASTA error : cannot create index, FASTA file not open" << endl;
125 cerr << "FASTA error : could not rewind FASTA file" << endl;
129 // clear out prior index data
132 // -------------------------------------------
133 // calculate lineLength & byteLength
140 if ( fgets(buffer, 1024, Stream) == 0 ) {
141 cerr << "FASTA error : could not read from file" << endl;
144 if ( feof(Stream) ) return false;
145 if ( buffer[0] != '>' ) {
146 cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
150 // read in first line of sequence
151 char c = fgetc(Stream);
152 while ( (c >= 0) && (c != '\n') ) {
154 if (isgraph(c)) ++lineLength;
157 ++byteLength; // store newline
161 cerr << "FASTA error : could not rewind FASTA file" << endl;
165 // iterate through fasta entries
168 string sequence = "";
169 while ( GetNextHeader(header) ) {
171 // ---------------------------
172 // build index entry data
175 // store file offset of beginning of DNA sequence (after header)
176 data.Offset = ftello(Stream);
178 // parse header, store sequence name in data.Name
179 if ( !GetNameFromHeader(header, data.Name) ) {
180 cerr << "FASTA error : could not parse read name from FASTA header" << endl;
184 // retrieve FASTA sequence
185 if ( !GetNextSequence(sequence) ) {
186 cerr << "FASTA error : could not read in next sequence from FASTA file" << endl;
190 // store sequence length & line/byte lengths
191 data.Length = sequence.length();
192 data.LineLength = lineLength;
193 data.ByteLength = byteLength;
196 Index.push_back(data);
203 if ( !indexFilename.empty() ) {
204 IndexStream = fopen(indexFilename.c_str(), "wb");
205 if ( !IndexStream ) {
206 cerr << "FASTA error : Could not open " << indexFilename << " for writing." << endl;
213 if ( !WriteIndexData() ) return false;
220 // return succes status
224 bool Fasta::FastaPrivate::GetBase(const int& refId, const int& position, char& base) {
226 // make sure FASTA file is open
228 cerr << "FASTA error : file not open for reading" << endl;
232 // use index if available
233 if ( HasIndex && !Index.empty() ) {
235 // validate reference id
236 if ( (refId < 0) || (refId >= (int)Index.size()) ) {
237 cerr << "FASTA error: invalid refId specified: " << refId << endl;
241 // retrieve reference index data
242 const FastaIndexData& referenceData = Index.at(refId);
245 if ( (position < 0) || (position > referenceData.Length) ) {
246 cerr << "FASTA error: invalid position specified: " << position << endl;
250 // seek to beginning of sequence data
251 if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
252 cerr << "FASTA error : could not sek in file" << endl;
257 string sequence = "";
258 if ( !GetNextSequence(sequence) ) {
259 cerr << "FASTA error : could not retrieve base from FASTA file" << endl;
263 // set base & return success
264 base = sequence.at(position);
268 // else plow through sequentially
273 cerr << "FASTA error : could not rewind FASTA file" << endl;
277 // iterate through fasta entries
280 string sequence = "";
283 GetNextHeader(header);
284 GetNextSequence(sequence);
286 while ( currentId != refId ) {
287 GetNextHeader(header);
288 GetNextSequence(sequence);
292 // get desired base from sequence
293 // TODO: error reporting on invalid position
294 if ( currentId == refId && (sequence.length() >= (size_t)position) ) {
295 base = sequence.at(position);
299 // could not get sequence
307 bool Fasta::FastaPrivate::GetNameFromHeader(const string& header, string& name) {
309 // get rid of the leading greater than sign
310 string s = header.substr(1);
312 // extract the first non-whitespace segment
313 char* pName = (char*)s.data();
314 unsigned int nameLen = (unsigned int)s.size();
316 unsigned int start = 0;
317 while ( (pName[start] == 32) || (pName[start] == 9) || (pName[start] == 10) || (pName[start] == 13) ) {
319 if ( start == nameLen )
323 unsigned int stop = start;
324 if ( stop < nameLen ) {
325 while( (pName[stop] != 32) && (pName[stop] != 9) && (pName[stop] != 10) && (pName[stop] != 13) ) {
327 if ( stop == nameLen )
332 if ( start == stop ) {
333 cerr << "FASTA error : could not parse read name from FASTA header" << endl;
337 name = s.substr(start, stop - start).c_str();
341 bool Fasta::FastaPrivate::GetNextHeader(string& header) {
343 // validate input stream
344 if ( !IsOpen || feof(Stream) )
347 // read in header line
349 if ( fgets(buffer, 1024, Stream) == 0 ) {
350 cerr << "FASTA error : could not read from file" << endl;
354 // make sure it's a FASTA header
355 if ( buffer[0] != '>' ) {
356 cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
360 // import buffer contents to header string
361 stringstream headerBuffer("");
362 headerBuffer << buffer;
363 header = headerBuffer.str();
369 bool Fasta::FastaPrivate::GetNextSequence(string& sequence) {
371 // validate input stream
372 if ( !IsOpen || feof(Stream) )
377 ostringstream seqBuffer("");
380 char ch = fgetc(Stream);
382 if( (ch == '>') || feof(Stream) )
385 if ( fgets(buffer, 1024, Stream) == 0 ) {
386 cerr << "FASTA error : could not read from file" << endl;
394 // import buffer contents to sequence string
395 sequence = seqBuffer.str();
401 bool Fasta::FastaPrivate::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
403 // make sure FASTA file is open
405 cerr << "FASTA error : file not open for reading" << endl;
409 // use index if available
410 if ( HasIndex && !Index.empty() ) {
412 // validate reference id
413 if ( (refId < 0) || (refId >= (int)Index.size()) ) {
414 cerr << "FASTA error: invalid refId specified: " << refId << endl;
418 // retrieve reference index data
419 const FastaIndexData& referenceData = Index.at(refId);
421 // validate stop position
422 if ( (start < 0) || (start > stop) || (stop > referenceData.Length) ) {
423 cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " << stop << endl;
427 // seek to beginning of sequence data
428 if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
429 cerr << "FASTA error : could not sek in file" << endl;
433 // retrieve full sequence
434 string fullSequence = "";
435 if ( !GetNextSequence(fullSequence) ) {
436 cerr << "FASTA error : could not retrieve sequence from FASTA file" << endl;
440 // set sub-sequence & return success
441 const int seqLength = (stop - start) + 1;
442 sequence = fullSequence.substr(start, seqLength);
446 // else plow through sequentially
451 cerr << "FASTA error : could not rewind FASTA file" << endl;
455 // iterate through fasta entries
458 string fullSequence = "";
461 GetNextHeader(header);
462 GetNextSequence(fullSequence);
464 while ( currentId != refId ) {
465 GetNextHeader(header);
466 GetNextSequence(fullSequence);
470 // get desired substring from sequence
471 // TODO: error reporting on invalid start/stop positions
472 if ( currentId == refId && (fullSequence.length() >= (size_t)stop) ) {
473 const int seqLength = (stop - start) + 1;
474 sequence = fullSequence.substr(start, seqLength);
478 // could not get sequence
486 bool Fasta::FastaPrivate::LoadIndexData(void) {
488 // skip if no index file available
489 if ( !IsIndexOpen ) return false;
491 // clear any prior index data
495 stringstream indexBuffer;
498 char c = fgetc(IndexStream);
499 if ( (c == '\n') || feof(IndexStream) ) break;
500 ungetc(c, IndexStream);
502 // clear index buffer
505 // read line from index file
506 if ( fgets(buffer, 1024, IndexStream) == 0 ) {
507 cerr << "FASTA LoadIndexData() error : could not read from index file" << endl;
512 // store line in indexBuffer
513 indexBuffer << buffer;
515 // retrieve fasta index data from line
517 indexBuffer >> data.Name;
518 indexBuffer >> data.Length;
519 indexBuffer >> data.Offset;
520 indexBuffer >> data.LineLength;
521 indexBuffer >> data.ByteLength;
524 Index.push_back(data);
530 bool Fasta::FastaPrivate::Open(const string& filename, const string& indexFilename) {
534 // open FASTA filename
535 Stream = fopen(filename.c_str(), "rb");
537 cerr << "FASTA error: Could not open " << filename << " for reading" << endl;
543 // open index file if it exists
544 if ( !indexFilename.empty() ) {
545 IndexStream = fopen(indexFilename.c_str(), "rb");
546 if ( !IndexStream ) {
547 cerr << "FASTA error : Could not open " << indexFilename << " for reading." << endl;
551 success &= IsIndexOpen;
553 // attempt to load index data
554 HasIndex = LoadIndexData();
558 // return success status
562 bool Fasta::FastaPrivate::Rewind(void) {
563 if ( !IsOpen ) return false;
564 return ( fseeko(Stream, 0, SEEK_SET) == 0 );
567 bool Fasta::FastaPrivate::WriteIndexData(void) {
569 // skip if no index file available
570 if ( !IsIndexOpen ) return false;
572 // iterate over index entries
574 stringstream indexBuffer;
575 vector<FastaIndexData>::const_iterator indexIter = Index.begin();
576 vector<FastaIndexData>::const_iterator indexEnd = Index.end();
577 for ( ; indexIter != indexEnd; ++indexIter ) {
582 // write data to stream
583 const FastaIndexData& data = (*indexIter);
584 indexBuffer << data.Name << "\t"
585 << data.Length << "\t"
586 << data.Offset << "\t"
587 << data.LineLength << "\t"
588 << data.ByteLength << endl;
590 // write stream to file
591 success &= ( fputs(indexBuffer.str().c_str(), IndexStream) >= 0 );
594 // return success status
598 // --------------------------------
599 // Fasta implementation
602 d = new FastaPrivate;
605 Fasta::~Fasta(void) {
610 bool Fasta::Close(void) {
614 bool Fasta::CreateIndex(const string& indexFilename) {
615 return d->CreateIndex(indexFilename);
618 bool Fasta::GetBase(const int& refId, const int& position, char& base) {
619 return d->GetBase(refId, position, base);
622 bool Fasta::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
623 return d->GetSequence(refId, start, stop, sequence);
626 bool Fasta::Open(const string& filename, const string& indexFilename) {
627 return d->Open(filename, indexFilename);