5 * Created by Pat Schloss on 12/15/08.
6 * Copyright 2008 Patrick D. Schloss. All rights reserved.
10 #include "sequence.hpp"
12 /***********************************************************************/
14 m = MothurOut::getInstance();
17 /***********************************************************************/
18 Sequence::Sequence(string newName, string sequence) {
20 m = MothurOut::getInstance();
24 for (int i = 0; i < name.length(); i++) {
25 if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; }
28 //setUnaligned removes any gap characters for us
29 setUnaligned(sequence);
33 m->errorOut(e, "Sequence", "Sequence");
37 /***********************************************************************/
38 Sequence::Sequence(string newName, string sequence, string justUnAligned) {
40 m = MothurOut::getInstance();
44 for (int i = 0; i < name.length(); i++) {
45 if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; }
48 //setUnaligned removes any gap characters for us
49 setUnaligned(sequence);
52 m->errorOut(e, "Sequence", "Sequence");
57 //********************************************************************************************************************
58 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
59 Sequence::Sequence(istringstream& fastaString){
61 m = MothurOut::getInstance();
64 name = getSequenceName(fastaString);
66 if (!m->control_pressed) {
70 while ((name[0] == '#') && fastaString) {
71 while (!fastaString.eof()) { char c = fastaString.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
72 sequence = getCommentString(fastaString);
76 name = name.substr(1);
83 while (!fastaString.eof()) { char c = fastaString.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
86 sequence = getSequenceString(fastaString, numAmbig);
89 //setUnaligned removes any gap characters for us
90 setUnaligned(sequence);
92 if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
97 m->errorOut(e, "Sequence", "Sequence");
101 //********************************************************************************************************************
102 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
103 Sequence::Sequence(istringstream& fastaString, string JustUnaligned){
105 m = MothurOut::getInstance();
108 name = getSequenceName(fastaString);
110 if (!m->control_pressed) {
114 while ((name[0] == '#') && fastaString) {
115 while (!fastaString.eof()) { char c = fastaString.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
116 sequence = getCommentString(fastaString);
120 name = name.substr(1);
127 while (!fastaString.eof()) { char c = fastaString.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
130 sequence = getSequenceString(fastaString, numAmbig);
132 //setUnaligned removes any gap characters for us
133 setUnaligned(sequence);
135 if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
140 catch(exception& e) {
141 m->errorOut(e, "Sequence", "Sequence");
147 //********************************************************************************************************************
148 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
149 Sequence::Sequence(ifstream& fastaFile){
151 m = MothurOut::getInstance();
153 name = getSequenceName(fastaFile);
155 if (!m->control_pressed) {
160 while ((name[0] == '#') && fastaFile) {
161 while (!fastaFile.eof()) { char c = fastaFile.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
162 sequence = getCommentString(fastaFile);
166 name = name.substr(1);
174 while (!fastaFile.eof()) { char c = fastaFile.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
177 sequence = getSequenceString(fastaFile, numAmbig);
179 setAligned(sequence);
180 //setUnaligned removes any gap characters for us
181 setUnaligned(sequence);
183 if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
188 catch(exception& e) {
189 m->errorOut(e, "Sequence", "Sequence");
193 //********************************************************************************************************************
194 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
195 Sequence::Sequence(ifstream& fastaFile, string& extraInfo, bool getInfo){
197 m = MothurOut::getInstance();
201 name = getSequenceName(fastaFile);
203 if (!m->control_pressed) {
207 while ((name[0] == '#') && fastaFile) {
208 while (!fastaFile.eof()) { char c = fastaFile.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
209 sequence = getCommentString(fastaFile);
213 name = name.substr(1);
220 //read info after sequence name
221 while (!fastaFile.eof()) {
222 char c = fastaFile.get();
223 if (c == 10 || c == 13 || c == -1){ break; }
228 sequence = getSequenceString(fastaFile, numAmbig);
230 setAligned(sequence);
231 //setUnaligned removes any gap characters for us
232 setUnaligned(sequence);
234 if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
238 catch(exception& e) {
239 m->errorOut(e, "Sequence", "Sequence");
243 //********************************************************************************************************************
244 //this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
245 Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){
247 m = MothurOut::getInstance();
249 name = getSequenceName(fastaFile);
251 if (!m->control_pressed) {
255 while ((name[0] == '#') && fastaFile) {
256 while (!fastaFile.eof()) { char c = fastaFile.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
257 sequence = getCommentString(fastaFile);
261 name = name.substr(1);
269 while (!fastaFile.eof()) { char c = fastaFile.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
272 sequence = getSequenceString(fastaFile, numAmbig);
274 //setUnaligned removes any gap characters for us
275 setUnaligned(sequence);
277 if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
282 catch(exception& e) {
283 m->errorOut(e, "Sequence", "Sequence");
287 //********************************************************************************************************************
288 string Sequence::getSequenceName(ifstream& fastaFile) {
294 if (name.length() != 0) {
296 name = name.substr(1);
298 for (int i = 0; i < name.length(); i++) {
299 if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; }
302 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true; }
306 catch(exception& e) {
307 m->errorOut(e, "Sequence", "getSequenceName");
311 //********************************************************************************************************************
312 string Sequence::getSequenceName(istringstream& fastaFile) {
318 if (name.length() != 0) {
320 name = name.substr(1);
322 for (int i = 0; i < name.length(); i++) {
323 if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; }
326 }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true; }
330 catch(exception& e) {
331 m->errorOut(e, "Sequence", "getSequenceName");
335 //********************************************************************************************************************
336 string Sequence::getSequenceString(ifstream& fastaFile, int& numAmbig) {
339 string sequence = "";
343 letter= fastaFile.get();
345 fastaFile.putback(letter);
347 }else if (letter == ' ') {;}
348 else if(isprint(letter)){
349 letter = toupper(letter);
350 if(letter == 'U'){letter = 'T';}
351 if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G' && letter != 'C' && letter != 'N'){
361 catch(exception& e) {
362 m->errorOut(e, "Sequence", "getSequenceString");
366 //********************************************************************************************************************
367 //comment can contain '>' so we need to account for that
368 string Sequence::getCommentString(ifstream& fastaFile) {
371 string sequence = "";
374 letter=fastaFile.get();
375 if((letter == '\r') || (letter == '\n')){
376 m->gobble(fastaFile); //in case its a \r\n situation
383 catch(exception& e) {
384 m->errorOut(e, "Sequence", "getCommentString");
388 //********************************************************************************************************************
389 string Sequence::getSequenceString(istringstream& fastaFile, int& numAmbig) {
392 string sequence = "";
395 while(!fastaFile.eof()){
396 letter= fastaFile.get();
399 fastaFile.putback(letter);
401 }else if (letter == ' ') {;}
402 else if(isprint(letter)){
403 letter = toupper(letter);
404 if(letter == 'U'){letter = 'T';}
405 if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G' && letter != 'C' && letter != 'N'){
415 catch(exception& e) {
416 m->errorOut(e, "Sequence", "getSequenceString");
420 //********************************************************************************************************************
421 //comment can contain '>' so we need to account for that
422 string Sequence::getCommentString(istringstream& fastaFile) {
425 string sequence = "";
428 letter=fastaFile.get();
429 if((letter == '\r') || (letter == '\n')){
430 m->gobble(fastaFile); //in case its a \r\n situation
437 catch(exception& e) {
438 m->errorOut(e, "Sequence", "getCommentString");
442 //********************************************************************************************************************
444 void Sequence::initialize(){
456 longHomoPolymer = -1;
461 //********************************************************************************************************************
463 void Sequence::setName(string seqName) {
464 if(seqName[0] == '>') { name = seqName.substr(1); }
465 else { name = seqName; }
468 //********************************************************************************************************************
470 void Sequence::setUnaligned(string sequence){
472 if(sequence.find_first_of('.') != string::npos || sequence.find_first_of('-') != string::npos) {
474 for(int j=0;j<sequence.length();j++) {
475 if(isalpha(sequence[j])) { temp += sequence[j]; }
480 unaligned = sequence;
482 numBases = unaligned.length();
486 //********************************************************************************************************************
488 void Sequence::setAligned(string sequence){
490 //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
492 alignmentLength = aligned.length();
493 setUnaligned(sequence);
495 if(aligned[0] == '-'){
496 for(int i=0;i<alignmentLength;i++){
497 if(aligned[i] == '-'){
504 for(int i=alignmentLength-1;i>=0;i--){
505 if(aligned[i] == '-'){
516 //********************************************************************************************************************
518 void Sequence::setPairwise(string sequence){
522 //********************************************************************************************************************
524 string Sequence::convert2ints() {
526 if(unaligned == "") { /* need to throw an error */ }
530 for(int i=0;i<unaligned.length();i++) {
531 if(toupper(unaligned[i]) == 'A') { processed += '0'; }
532 else if(toupper(unaligned[i]) == 'C') { processed += '1'; }
533 else if(toupper(unaligned[i]) == 'G') { processed += '2'; }
534 else if(toupper(unaligned[i]) == 'T') { processed += '3'; }
535 else if(toupper(unaligned[i]) == 'U') { processed += '3'; }
536 else { processed += '4'; }
541 //********************************************************************************************************************
543 string Sequence::getName(){
547 //********************************************************************************************************************
549 string Sequence::getAligned(){
550 if(isAligned == 0) { return unaligned; }
551 else { return aligned; }
554 //********************************************************************************************************************
556 string Sequence::getInlineSeq(){
557 return name + '\t' + aligned;
561 //********************************************************************************************************************
563 string Sequence::getPairwise(){
567 //********************************************************************************************************************
569 string Sequence::getUnaligned(){
573 //********************************************************************************************************************
575 int Sequence::getNumBases(){
578 //********************************************************************************************************************
580 int Sequence::getNumNs(){
582 for (int i = 0; i < unaligned.length(); i++) {
583 if(toupper(unaligned[i]) == 'N') { numNs++; }
588 //********************************************************************************************************************
590 void Sequence::printSequence(ostream& out){
592 out << ">" << name << endl;
594 out << aligned << endl;
597 out << unaligned << endl;
601 //********************************************************************************************************************
603 int Sequence::getAlignLength(){
604 return alignmentLength;
607 //********************************************************************************************************************
609 int Sequence::getAmbigBases(){
610 if(ambigBases == -1){
612 for(int j=0;j<numBases;j++){
613 if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
622 //********************************************************************************************************************
624 void Sequence::removeAmbigBases(){
626 for(int j=0;j<alignmentLength;j++){
627 if(aligned[j] != 'A' && aligned[j] != 'T' && aligned[j] != 'G' && aligned[j] != 'C'){
631 setUnaligned(aligned);
634 //********************************************************************************************************************
636 int Sequence::getLongHomoPolymer(){
637 if(longHomoPolymer == -1){
640 for(int j=1;j<numBases;j++){
641 if(unaligned[j] == unaligned[j-1]){
645 if(homoPolymer > longHomoPolymer){ longHomoPolymer = homoPolymer; }
649 if(homoPolymer > longHomoPolymer){ longHomoPolymer = homoPolymer; }
651 return longHomoPolymer;
654 //********************************************************************************************************************
656 int Sequence::getStartPos(){
658 for(int j = 0; j < alignmentLength; j++) {
659 if((aligned[j] != '.')&&(aligned[j] != '-')){
665 if(isAligned == 0){ startPos = 1; }
670 //********************************************************************************************************************
672 void Sequence::padToPos(int start){
674 for(int j = startPos-1; j < start-1; j++) {
680 //********************************************************************************************************************
682 int Sequence::filterToPos(int start){
684 if (start > aligned.length()) { start = aligned.length(); m->mothurOut("[ERROR]: start to large.\n"); }
686 for(int j = 0; j < start; j++) {
690 //things like ......----------AT become ................AT
691 for(int j = start; j < aligned.length(); j++) {
692 if (isalpha(aligned[j])) { break; }
693 else { aligned[j] = '.'; }
695 setUnaligned(aligned);
700 //********************************************************************************************************************
702 int Sequence::filterFromPos(int end){
704 if (end > aligned.length()) { end = aligned.length(); m->mothurOut("[ERROR]: end to large.\n"); }
706 for(int j = end; j < aligned.length(); j++) {
710 for(int j = aligned.length()-1; j < 0; j--) {
711 if (isalpha(aligned[j])) { break; }
712 else { aligned[j] = '.'; }
715 setUnaligned(aligned);
719 //********************************************************************************************************************
721 int Sequence::getEndPos(){
723 for(int j=alignmentLength-1;j>=0;j--){
724 if((aligned[j] != '.')&&(aligned[j] != '-')){
730 if(isAligned == 0){ endPos = numBases; }
735 //********************************************************************************************************************
737 void Sequence::padFromPos(int end){
738 //cout << end << '\t' << endPos << endl;
739 for(int j = end; j < endPos; j++) {
746 //********************************************************************************************************************
748 bool Sequence::getIsAligned(){
751 //********************************************************************************************************************
753 void Sequence::reverseComplement(){
756 for(int i=numBases-1;i>=0;i--){
757 if(unaligned[i] == 'A') { temp += 'T'; }
758 else if(unaligned[i] == 'T'){ temp += 'A'; }
759 else if(unaligned[i] == 'G'){ temp += 'C'; }
760 else if(unaligned[i] == 'C'){ temp += 'G'; }
761 else { temp += 'N'; }
768 //********************************************************************************************************************
770 void Sequence::trim(int length){
772 if(numBases > length){
773 unaligned = unaligned.substr(0,length);
781 ///**************************************************************************************************/