]> git.donarmstrong.com Git - mothur.git/blob - chimera.cpp
chimeras, fix to sabundvector and sharedsabundvector that caused getRabundVector...
[mothur.git] / chimera.cpp
1 /*
2  *  chimera.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 8/11/09.
6  *  Copyright 2009 Schloss Lab Umass Amherst. All rights reserved.
7  *
8  */
9
10 #include "chimera.h"
11
12 //***************************************************************************************************************
13 //this is a vertical soft filter
14 string Chimera::createFilter(vector<Sequence*> seqs, float t) {
15         try {
16                 filterString = "";
17                 int threshold = int (t * seqs.size());
18 //cout << "threshhold = " << threshold << endl;
19                 
20                 vector<int> gaps;       gaps.resize(seqs[0]->getAligned().length(), 0);
21                 vector<int> a;          a.resize(seqs[0]->getAligned().length(), 0);
22                 vector<int> t;          t.resize(seqs[0]->getAligned().length(), 0);
23                 vector<int> g;          g.resize(seqs[0]->getAligned().length(), 0);
24                 vector<int> c;          c.resize(seqs[0]->getAligned().length(), 0);
25                 
26                 filterString = (string(seqs[0]->getAligned().length(), '1'));
27                 
28                 //for each sequence
29                 for (int i = 0; i < seqs.size(); i++) {
30                 
31                         string seqAligned = seqs[i]->getAligned();
32                         
33                         for (int j = 0; j < seqAligned.length(); j++) {
34                                 //if this spot is a gap
35                                 if ((seqAligned[j] == '-') || (seqAligned[j] == '.'))   {       gaps[j]++;      }
36                                 else if (toupper(seqAligned[j]) == 'A')                                 {       a[j]++;         }
37                                 else if (toupper(seqAligned[j]) == 'T')                                 {       t[j]++;         }
38                                 else if (toupper(seqAligned[j]) == 'G')                                 {       g[j]++;         }
39                                 else if (toupper(seqAligned[j]) == 'C')                                 {       c[j]++;         }
40                         }
41                 }
42                 
43                 //zero out spot where all sequences have blanks
44                 //zero out spot where all sequences have blanks
45                 int numColRemoved = 0;
46                 for(int i = 0;i < seqs[0]->getAligned().length(); i++){
47                         if(gaps[i] == seqs.size())      {       filterString[i] = '0';  numColRemoved++;  }
48                         
49                         else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) {      filterString[i] = '0';  numColRemoved++;  }
50                         //cout << "a = " << a[i] <<  " t = " << t[i] <<  " g = " << g[i] <<  " c = " << c[i] << endl;
51                 }
52         
53 //cout << "filter = " << filterString << endl;  
54
55                 mothurOut("Filter removed " + toString(numColRemoved) + " columns.");  mothurOutEndLine();
56                 return filterString;
57         }
58         catch(exception& e) {
59                 errorOut(e, "Chimera", "createFilter");
60                 exit(1);
61         }
62 }
63 //***************************************************************************************************************
64 map<int, int> Chimera::runFilter(Sequence* seq) {
65         try {
66                 map<int, int> maskMap;
67                 string seqAligned = seq->getAligned();
68                 string newAligned = "";
69                 int count = 0;
70                         
71                 for (int j = 0; j < seqAligned.length(); j++) {
72                         //if this spot is a gap
73                         if (filterString[j] == '1') { 
74                                 newAligned += seqAligned[j]; 
75                                 maskMap[count] = j;
76                                 count++;
77                         }
78                 }
79                         
80                 seq->setAligned(newAligned);
81                 
82                 return maskMap;
83         }
84         catch(exception& e) {
85                 errorOut(e, "Chimera", "runFilter");
86                 exit(1);
87         }
88 }
89 //***************************************************************************************************************
90 vector<Sequence*> Chimera::readSeqs(string file) {
91         try {
92         
93                 mothurOut("Reading sequences... "); cout.flush();
94                 ifstream in;
95                 openInputFile(file, in);
96                 vector<Sequence*> container;
97                 int count = 0;
98                 int length = 0;
99                 unaligned = false;
100                 
101                 //read in seqs and store in vector
102                 while(!in.eof()){
103                         
104                         Sequence* current = new Sequence(in);  gobble(in);
105                         
106                         if (count == 0) {  length = current->getAligned().length();  count++;  } //gets first seqs length
107                         else if (length != current->getAligned().length()) { //seqs are unaligned
108                                 unaligned = true;
109                         }
110                         
111                         if (current->getName() != "") {  container.push_back(current);  }
112                 }
113                 
114                 in.close();
115                 mothurOut("Done."); mothurOutEndLine();
116                 
117                 return container;
118         }
119         catch(exception& e) {
120                 errorOut(e, "Chimera", "readSeqs");
121                 exit(1);
122         }
123 }
124 //***************************************************************************************************************
125 void Chimera::setMask(string filename) {
126         try {
127                 
128                 if (filename == "default") {
129                         //default is from wigeon  236627 EU009184.1 Shigella dysenteriae str. FBD013
130                         seqMask = ".....................................................................................................AAATTGAAGAGTTT-GA--T-CA-T-G-GCTC-AG-AT-TGAA-C-GC--TGG-C--G-GC-A-GG--C----C-T--AACACA-T-GC-A-AGT-CGA-A-CG----------G-TAA-CA-G----------------------------GAAG-A-AG----------------------------------------------------CTT-G----------------------------------------------------------------------------------CT-TCTTT----------------G-CT--G--AC--G--AG-T-GG-C-GG-A--C-------------GGG-TGAGT-A--AT-GT-C-T-G-GG---A-A--A-CT-G--C-C-TGA--TG-G------------------------------------------------------------------A-GG----GGG-AT-AA-CTA-------------------------C-T-G-----------------------GAA-A---CGG-TAG-CTAA-TA---CC-G--C-AT-A----------A--------------------C-------------------------------------GT-C-----------------------------------------------------------------------------------------------------------------------G-CA-A--------------------------------------------------------------------------------------------------------------------------------------G-A-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CAAA--G-A-G-GG-----G--GA-C-CT--------------------------------------------------------------------------------------------------------------------TCG-G----------------------------------------------------------------------------------------------------------------------G----CC-TC--T---T-G--------------C----C-A---T-CG-G---AT---G-T-----G-CCC-AGA--T-GGG--A------TT--A--G-CT-A----G---TAGG-T-G-GG-G-T----AAC-GG-C-T-C-ACCT--A-GG-C-G--A-CG-A------------TCC-C-T------AG-CT-G-G-TCT-G-AG----A--GG-AT--G-AC-C-AG-CCAC-A-CTGGA--A-C-TG-A-GA-C-AC-G-G-TCCAGA-CTCC-TAC-G--G-G-A-G-GC-A-GC-A-G-TG---GG-G-A-ATA-TTGCA-C-AA-T-GG--GC-GC-A----A-G-CC-T-GA-TG-CA-GCCA-TGCC-G-CG-T---G-T-A--T--GA-A-G--A--A-G-G-CC-----TT-CG---------G-G-T-T-G-T--A---AA-G-TAC--------TT-TC-A-G--C-GGG----GA-G--G---AA-GGGA---GTAA-AG----T--T--AA-T---A----C-----CT-T-TGC-TCA-TT-GA-CG-TT-A-C-CC-G-CA-G---------AA-----------GAAGC-ACC-GG-C-TAA---C--T-CCGT--GCCA--G-C---A--GCCG---C-GG--TA-AT--AC---GG-AG-GGT-GCA-A-G-CG-TTAA-T-CGG-AA-TT-A--C-T--GGGC-GTA----AA-GCGC-AC--G-CA-G-G-C-G------------G--T-TT-G-T-T-AA----G-T-C-A---G-ATG-TG-A-AA-TC--CC-CGG-G--------------------------------------------------------------------CT-C-AA-------------------------------------------------------------------------CC-T-G-GG-AA-C----T-G-C-A-T-C--------T--GA-T-A-C-T-G-GCA--A-G-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------T-T-G-A-G-T-C-----T-CG--TA-G-A------------G-GG-G-GG-T----AG--AATT-CCA-G-GT--GT-A-GCG-GTGAAA-TG-CGT-AGAG-A-TC-T-GGA--GG-A-AT-A-CC-GG--T--G--GC-GAA-G--G-C---G----G--C-C-CCCTG------G-AC-GA--------------------------------------------------------------AG-A-C-T--GA--CG-----CT-CA-GG--T-G-CGA--AA-G-C--------------G-TGGG-GAG-C-A-AACA--GG-ATTA-G-ATA-C-----CC-T-G-GTA-G-T----C-CA--C-G-CCG-T-AAA--C-GATG-TC--GA-CT---------T-GG--A--G-G-TT-G-TG-C--C--------------------------------------------------------------------------------------CTT-GA--------------------------------------------------------------------------------------------------------------------------------------------------G-G-C-GT--G-G-C-T-TC-C------GG--A----GC-TAA--CG-C-G-T--T--AA-GT--C----G-ACC-GCC-T-G-GG-GAG-TA---CGG-----C-C--G-C-A-A-GGT-T--AAA-ACTC-AAA---------TGAA-TTG-ACGGG-G-G-CCCG----C-A--C-A-A-GCG-GT-G--G--AG-CA-T--GT-GGT-TT-AATT-C-G-ATG-CAAC-G-CG-A-AG-A-A-CC-TT-A-CC-TGGTC-TT-G-AC-A-T-C--------------CAC-G-G-------------A-AG-T-T-T--TC--A-GA-G-A-T--G-A-G--A-A-T-G--T-G-----CC-------------------------------------T--TC-G------------------------------------------GG----A----A---CC-GTG---A--GA---------------------------------------------------C-A-G-G-T-GCTG-CA-TGG-CT--GTC-GTC-A-GC-TC---G-TG-TT-G--TGA-AA-TGT-T-GG-G-TT-AA-GT-CCCGC-AA--------C-GAG-CGC-A-ACC-C-T-TA--TC--C-TTTG--T-T-G-C-C---AG-C-G-----G-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------TCC------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GG---C----C-G------------G----G---A-A--CT---------------C-A-A-A-G-GA-G--AC-T-G-CCA--G-T------------------------------------G-A---TAA----------------------------------A-C-T-G--G-A-GG-A--AGG-T--GGGG-A-TGAC-GTC--AAGT-C---ATC-A-T-G-G-C-C-CTT----AC-G--AC-C-A-GG-GC-TA-CAC-ACGTG-C--TA--CAATG---G-CGCA-T-A--C-AAA-GA-GA--------------------------------------------------------------------------------------------------A-G-C-G-A--C-CTCG-C--G---------------------------------------A-GA-G-C-----------A--A-G-CG---G----------A--CCT-C------A-T-AAAGT-GC-G-T-C-G-TAG-TCC--------GGA-T-TGGAG-TC--T-GCAA-CT-C-------------------------------------------------------------------------------------------------G-ACTCC-A-T-G-AA-G-TC-GGAAT-CG-C-TA--G-TA-AT-C-G-T----GGA-TC-A-G--A------AT--GCC-AC-G-GT-G-AAT-ACGT-T-CCCGGGCCT-TGTA----CACACCG-CCC-GTC-----A---CA--CCA-TG-GG-A--G---TGG-G-TT-GC-AAA--A-GAA------G--T-AGG-TA-G-C-T-T-AA-C-C--------------------------------------------------------------TT----C-------------------------------------------------------------------------------------------------G--GG-A--GG-G--C---GC-TTA--CC--ACT-T----T-GTG-AT-TCA------------------------TG--ACT-GGGG-TG-AAG-TCGTAACAA-GGTAA-CCGT-AGGGGAA-CCTG-CGGT-TGGATCACCTCCTTA................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................";
131                 }else if (filename == "") {  //do nothing 
132                         seqMask = "";
133                 }else{
134                         ifstream infile;
135                         openInputFile(filename, infile);
136                         
137                         while (!infile.eof()) {
138                                 Sequence temp(infile);
139                                 seqMask = temp.getAligned();
140                                 
141                                 gobble(infile);
142                         }
143                         
144                         infile.close();
145                 }
146         }
147         catch(exception& e) {
148                 errorOut(e, "Chimera", "setMask");
149                 exit(1);
150         }
151 }
152 //***************************************************************************************************************
153
154 vector< vector<float> > Chimera::readQuantiles() {
155         try {
156         
157                 ifstream in;
158                 openInputFile(quanfile, in);
159                 
160                 vector< vector<float> > quan;
161                 vector <float> temp; temp.resize(6, 0);
162                 
163                 //to fill 0
164                 quan.push_back(temp); 
165         
166                 int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; 
167                 
168                 while(!in.eof()){
169                         
170                         in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; 
171                         
172                         temp.clear();
173                         
174                         temp.push_back(ten); 
175                         temp.push_back(twentyfive);
176                         temp.push_back(fifty);
177                         temp.push_back(seventyfive);
178                         temp.push_back(ninetyfive);
179                         temp.push_back(ninetynine);
180                         
181                         quan.push_back(temp);  
182         
183                         gobble(in);
184                 }
185                 
186                 in.close();
187                 return quan;
188                 
189         }
190         catch(exception& e) {
191                 errorOut(e, "Chimera", "readQuantiles");
192                 exit(1);
193         }
194 }
195 //***************************************************************************************************************
196 Sequence* Chimera::getSequence(string name) {
197         try{
198                 Sequence* temp;
199                 
200                 //look through templateSeqs til you find it
201                 int spot = -1;
202                 for (int i = 0; i < templateSeqs.size(); i++) {
203                         if (name == templateSeqs[i]->getName()) {  
204                                 spot = i;
205                                 break;
206                         }
207                 }
208                 
209                 if(spot == -1) { mothurOut("Error: Could not find sequence."); mothurOutEndLine(); return NULL; }
210                 
211                 temp = new Sequence(templateSeqs[spot]->getName(), templateSeqs[spot]->getAligned());
212                 
213                 return temp;
214         }
215         catch(exception& e) {
216                 errorOut(e, "Chimera", "getSequence");
217                 exit(1);
218         }
219 }
220 //***************************************************************************************************************
221
222
223
224