]> git.donarmstrong.com Git - mothur.git/blob - ccode.cpp
1.9
[mothur.git] / ccode.cpp
1 /*
2  *  ccode.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 8/24/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "ccode.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14
15 //***************************************************************************************************************
16 Ccode::Ccode(string filename, string temp, bool f, string mask, int win, int numW, string o) : Chimera() {  
17         fastafile = filename;  
18         outputDir = o; 
19         templateFileName = temp;  templateSeqs = readSeqs(temp);
20         setMask(mask);
21         filter = f;
22         window = win;
23         numWanted = numW;
24         
25         distCalc = new eachGapDist();
26         decalc = new DeCalculator();
27         
28         mapInfo = outputDir + getRootName(getSimpleName(fastafile)) + "mapinfo";
29         
30         #ifdef USE_MPI
31                 
32                 char* inFileName = new char[mapInfo.length()];\r
33                 memcpy(inFileName, mapInfo.c_str(), mapInfo.length());
34                 
35                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
36
37                 MPI_File_open(MPI_COMM_WORLD, inFileName, outMode, MPI_INFO_NULL, &outMap);  //comm, filename, mode, info, filepointer
38                 
39                 delete inFileName;
40
41                 int pid;
42                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
43                 
44                 if (pid == 0) {
45                         string outString = "Place in masked, filtered and trimmed sequence\tPlace in original alignment\n";
46                         
47                         MPI_Status status;
48                         int length = outString.length();
49                         char* buf2 = new char[length];\r
50                         memcpy(buf2, outString.c_str(), length);
51                                 
52                         MPI_File_write_shared(outMap, buf2, length, MPI_CHAR, &status);
53                         delete buf2;
54                 }
55         #else
56
57                 ofstream out2;
58                 openOutputFile(mapInfo, out2);
59                 
60                 out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
61                 out2.close();
62         #endif
63 }
64 //***************************************************************************************************************
65 Ccode::~Ccode() {
66         delete distCalc;
67         delete decalc;
68         
69         #ifdef USE_MPI
70                 MPI_File_close(&outMap);
71         #endif
72 }       
73 //***************************************************************************************************************
74 int Ccode::print(ostream& out, ostream& outAcc) {
75         try {
76                 
77                 ofstream out2;
78                 openOutputFileAppend(mapInfo, out2);
79                 
80                 out2 << querySeq->getName() << endl;
81                 for (it = spotMap.begin(); it!= spotMap.end(); it++) {
82                         out2 << it->first << '\t' << it->second << endl;
83                 }
84                 out2.close();
85                 out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
86                         
87                 for (int j = 0; j < closest.size(); j++) {
88                         out << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
89                 }
90                 out << endl << endl;
91                 
92                 //for each window
93                 //window mapping info.
94                 out << "Mapping information: ";
95                 //you mask and did not filter
96                 if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
97                                 
98                 //you filtered and did not mask
99                 if ((seqMask == "") && (filter)) { out << "filter and trim."; }
100                                 
101                 //you masked and filtered
102                 if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
103                         
104                 out << endl << "Window\tStartPos\tEndPos" << endl;
105                 it = trim.begin();
106                 for (int k = 0; k < windows.size()-1; k++) {
107                         out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl;
108                 }
109                         
110                 out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl;
111                 out << endl;
112                 out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
113                 for (int k = 0; k < windows.size(); k++) {
114                         float ds = averageQuery[k] / averageRef[k]; 
115                         out << k+1 << '\t' << averageQuery[k] << '\t' << sdQuery[k] << '\t' << averageRef[k] << '\t'<< sdRef[k] << '\t' << ds << '\t' << anova[k] << endl;
116                 }
117                 out << endl;
118                         
119                 //varRef
120                 //varQuery
121                 /* F test for differences among variances.
122                 * varQuery is expected to be higher or similar than varRef */
123                 //float fs = varQuery[query] / varRef[query];   /* F-Snedecor, test for differences of variances */
124                         
125                 bool results = false;   
126                                         
127                 //confidence limit, t - Student, anova
128                 out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
129                         
130                 for (int k = 0; k < windows.size(); k++) {
131                         string temp = "";
132                         if (isChimericConfidence[k]) {  temp += "*\t"; }
133                         else { temp += "\t"; }
134                                 
135                         if (isChimericTStudent[k]) {  temp += "*\t"; }
136                         else { temp += "\t"; }
137                                 
138                         if (isChimericANOVA[k]) {  temp += "*\t"; }
139                         else { temp += "\t"; }
140                         
141                         out << k+1 << '\t' << temp << endl;
142                                 
143                         if (temp == "*\t*\t*\t") {  results = true;  }
144                 }
145                 out << endl;    
146                         
147                 if (results) {
148                         m->mothurOut(querySeq->getName() + " was found have at least one chimeric window."); m->mothurOutEndLine();
149                         outAcc << querySeq->getName() << endl;
150                 }
151
152                 //free memory
153                 for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
154
155                 return results;
156         }
157         catch(exception& e) {
158                 m->errorOut(e, "Ccode", "print");
159                 exit(1);
160         }
161 }
162 #ifdef USE_MPI
163 //***************************************************************************************************************
164 int Ccode::print(MPI_File& out, MPI_File& outAcc) {
165         try {
166                 
167                 string outMapString = "";
168                 
169                 outMapString += querySeq->getName() + "\n";
170                 for (it = spotMap.begin(); it!= spotMap.end(); it++) {
171                         outMapString += toString(it->first)  + "\t" + toString(it->second)  + "\n";
172                 }
173                 printMapping(outMapString);
174                 outMapString = "";
175                 
176                 string outString = "";
177                 string outAccString = "";
178                 
179                 outString +=  querySeq->getName() + "\n\nReference sequences used and distance to query:\n";
180                         
181                 for (int j = 0; j < closest.size(); j++) {
182                         outString += closest[j].seq->getName() + "\t" + toString(closest[j].dist) + "\n";
183                 }
184                 outString += "\n\nMapping information: ";
185                 
186                 //for each window
187                 //window mapping info.
188                 //you mask and did not filter
189                 if ((seqMask != "") && (!filter)) { outString += "mask and trim."; }
190                                 
191                 //you filtered and did not mask
192                 if ((seqMask == "") && (filter)) { outString += "filter and trim."; }
193                                 
194                 //you masked and filtered
195                 if ((seqMask != "") && (filter)) { outString += "mask, filter and trim."; }
196                         
197                 outString += "\nWindow\tStartPos\tEndPos\n";
198                 it = trim.begin();
199                 for (int k = 0; k < windows.size()-1; k++) {
200                         outString += toString(k+1) + "\t" + toString(spotMap[windows[k]-it->first]) + "\t" + toString(spotMap[windows[k]-it->first+windowSizes]) + "\n";
201                 }
202                         
203                 outString += toString(windows.size()) + "\t" + toString(spotMap[windows[windows.size()-1]-it->first]) + "\t" + toString(spotMap[it->second-it->first-1]) + "\n\n";
204                 
205                 outString += "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova\n";
206                 for (int k = 0; k < windows.size(); k++) {
207                         float ds = averageQuery[k] / averageRef[k]; 
208                         outString += toString(k+1) + "\t" + toString(averageQuery[k]) + "\t" + toString(sdQuery[k]) + "\t" + toString(averageRef[k]) + "\t" + toString(sdRef[k]) + "\t" + toString(ds) + "\t" + toString(anova[k]) + "\n";
209                 }
210                         
211                 //varRef
212                 //varQuery
213                 /* F test for differences among variances.
214                 * varQuery is expected to be higher or similar than varRef */
215                 //float fs = varQuery[query] / varRef[query];   /* F-Snedecor, test for differences of variances */
216                         
217                 bool results = false;   
218                                         
219                 //confidence limit, t - Student, anova
220                 outString += "\nWindow\tConfidenceLimit\tt-Student\tAnova\n";
221                         
222                 for (int k = 0; k < windows.size(); k++) {
223                         string temp = "";
224                         if (isChimericConfidence[k]) {  temp += "*\t"; }
225                         else { temp += "\t"; }
226                                 
227                         if (isChimericTStudent[k]) {  temp += "*\t"; }
228                         else { temp += "\t"; }
229                                 
230                         if (isChimericANOVA[k]) {  temp += "*\t"; }
231                         else { temp += "\t"; }
232                         
233                         outString += toString(k+1) + "\t" + temp + "\n";
234                                 
235                         if (temp == "*\t*\t*\t") {  results = true;  }
236                 }
237                 outString += "\n";      
238                 
239                 MPI_Status status;
240                 int length = outString.length();
241                 char* buf2 = new char[length];\r
242                 memcpy(buf2, outString.c_str(), length);
243                                 
244                 MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status);
245                 delete buf2;
246
247                 if (results) {
248                         m->mothurOut(querySeq->getName() + " was found have at least one chimeric window."); m->mothurOutEndLine();
249                         outAccString += querySeq->getName() + "\n";
250                         
251                         MPI_Status statusAcc;
252                         length = outAccString.length();
253                         char* buf = new char[length];\r
254                         memcpy(buf, outAccString.c_str(), length);
255                                 
256                         MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc);
257                         delete buf;
258                 }
259
260                 //free memory
261                 for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
262
263                 return results;
264         }
265         catch(exception& e) {
266                 m->errorOut(e, "Ccode", "print");
267                 exit(1);
268         }
269 }
270 //***************************************************************************************************************
271 int Ccode::printMapping(string& output) {
272         try {
273                         MPI_Status status;
274                         int length = output.length();
275                         char* buf = new char[length];\r
276                         memcpy(buf, output.c_str(), length);
277                                 
278                         MPI_File_write_shared(outMap, buf, length, MPI_CHAR, &status);
279                         delete buf;
280
281         }
282         catch(exception& e) {
283                 m->errorOut(e, "Ccode", "printMapping");
284                 exit(1);
285         }
286 }
287 #endif
288 //***************************************************************************************************************
289 int Ccode::getChimeras(Sequence* query) {
290         try {
291         
292                 closest.clear();
293                 refCombo = 0;
294                 sumRef.clear(); 
295                 varRef.clear(); 
296                 varQuery.clear(); 
297                 sdRef.clear(); 
298                 sdQuery.clear();     
299                 sumQuery.clear();
300                 sumSquaredRef.clear(); 
301                 sumSquaredQuery.clear(); 
302                 averageRef.clear();
303                 averageQuery.clear();
304                 anova.clear();
305                 isChimericConfidence.clear();
306                 isChimericTStudent.clear();
307                 isChimericANOVA.clear();
308                 trim.clear();
309                 spotMap.clear();
310                 windowSizes = window;
311                 windows.clear();
312
313         
314                 querySeq = query;
315                 
316                 //find closest matches to query
317                 closest = findClosest(query, numWanted);
318                 
319                 if (m->control_pressed) {  return 0;  }
320                 
321                 //initialize spotMap
322                 for (int i = 0; i < query->getAligned().length(); i++) {        spotMap[i] = i;         }
323         
324                 //mask sequences if the user wants to 
325                 if (seqMask != "") {
326                         decalc->setMask(seqMask);
327                         
328                         decalc->runMask(query);
329                         
330                         //mask closest
331                         for (int i = 0; i < closest.size(); i++) {      decalc->runMask(closest[i].seq);        }
332                         
333                         spotMap = decalc->getMaskMap();
334                 }
335                 
336                 if (filter) {
337                         vector<Sequence*> temp;
338                         for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq);  }
339                         temp.push_back(query);  
340                         
341                         createFilter(temp, 0.5);
342                 
343                         for (int i = 0; i < temp.size(); i++) { 
344                                 if (m->control_pressed) {  return 0;  }
345                                 runFilter(temp[i]);  
346                         }
347                         
348                         //update spotMap
349                         map<int, int> newMap;
350                         int spot = 0;
351                         
352                         for (int i = 0; i < filterString.length(); i++) {
353                                 if (filterString[i] == '1') {
354                                         //add to newMap
355                                         newMap[spot] = spotMap[i];
356                                         spot++;  
357                                 }
358                         }
359                         spotMap = newMap;
360                 }
361
362                 //trim sequences - this follows ccodes remove_extra_gaps 
363                 trimSequences(query);
364                 if (m->control_pressed) {  return 0;  }
365                 
366                 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().  
367                 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
368                 windows = findWindows();  
369                 if (m->control_pressed) {  return 0;  }
370
371                 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later 
372                 removeBadReferenceSeqs(closest);
373                 if (m->control_pressed) {  return 0;  }
374                 
375                 //find the averages for each querys references
376                 getAverageRef(closest);  //fills sumRef, averageRef, sumSquaredRef and refCombo.
377                 getAverageQuery(closest, query);  //fills sumQuery, averageQuery, sumSquaredQuery.
378                 if (m->control_pressed) {  return 0;  }                 
379                 
380                 //find the averages for each querys references 
381                 findVarianceRef();  //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0.
382                 if (m->control_pressed) {  return 0;  } 
383                         
384                 //find the averages for the query 
385                 findVarianceQuery();  //fills varQuery and sdQuery also sets minimum error rate to 0.001 to avoid divide by 0.
386                 if (m->control_pressed) {  return 0;  }
387                                         
388                 determineChimeras();  //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA. 
389                 if (m->control_pressed) {  return 0;  }
390                 
391                 return 0;
392         }
393         catch(exception& e) {
394                 m->errorOut(e, "Ccode", "getChimeras");
395                 exit(1);
396         }
397 }
398 /***************************************************************************************************************/
399 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
400 void Ccode::trimSequences(Sequence* query) {
401         try {
402                 
403                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
404                 int rearPos = query->getAligned().length();
405                 
406                 //********find first position in closest seqs that is a non gap character***********//
407                 //find first position all query seqs that is a non gap character
408                 for (int i = 0; i < closest.size(); i++) {
409                         
410                         string aligned = closest[i].seq->getAligned();
411                         int pos = 0;
412                         
413                         //find first spot in this seq
414                         for (int j = 0; j < aligned.length(); j++) {
415                                 if (isalpha(aligned[j])) {
416                                         pos = j;
417                                         break;
418                                 }
419                         }
420                         
421                         //save this spot if it is the farthest
422                         if (pos > frontPos) { frontPos = pos; }
423                 }
424                 
425                 //find first position all querySeq[query] that is a non gap character
426                 string aligned = query->getAligned();
427                 int pos = 0;
428                         
429                 //find first spot in this seq
430                 for (int j = 0; j < aligned.length(); j++) {
431                         if (isalpha(aligned[j])) {
432                                 pos = j;
433                                 break;
434                         }
435                 }
436                 
437                 //save this spot if it is the farthest
438                 if (pos > frontPos) { frontPos = pos; }
439                 
440                 
441                 //********find last position in closest seqs that is a non gap character***********//
442                 for (int i = 0; i < closest.size(); i++) {
443                         
444                         string aligned = closest[i].seq->getAligned();
445                         int pos = aligned.length();
446                         
447                         //find first spot in this seq
448                         for (int j = aligned.length()-1; j >= 0; j--) {
449                                 if (isalpha(aligned[j])) {
450                                         pos = j;
451                                         break;
452                                 }
453                         }
454                         
455                         //save this spot if it is the farthest
456                         if (pos < rearPos) { rearPos = pos; }
457                 }
458                 
459                 //find last position all querySeqs[query] that is a non gap character
460                 aligned = query->getAligned();
461                 pos = aligned.length();
462                 
463                 //find first spot in this seq
464                 for (int j = aligned.length()-1; j >= 0; j--) {
465                         if (isalpha(aligned[j])) {
466                                 pos = j;
467                                 break;
468                         }
469                 }
470                 
471                 //save this spot if it is the farthest
472                 if (pos < rearPos) { rearPos = pos; }
473
474                 
475                 //check to make sure that is not whole seq
476                 if ((rearPos - frontPos - 1) <= 0) {  m->mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); m->mothurOutEndLine(); exit(1);  }
477                 
478                 map<int, int> tempTrim;
479                 tempTrim[frontPos] = rearPos;
480                 
481                 //save trimmed locations
482                 trim = tempTrim;
483                                                 
484                 //update spotMask
485                 map<int, int> newMap;
486                 int spot = 0;
487                 
488                 for (int i = frontPos; i < rearPos; i++) {
489                         //add to newMap
490                         newMap[spot] = spotMap[i];
491                         spot++;  
492                 }
493                 spotMap = newMap;
494         }
495         catch(exception& e) {
496                 m->errorOut(e, "Ccode", "trimSequences");
497                 exit(1);
498         }
499 }
500 /***************************************************************************************************************/
501 vector<int> Ccode::findWindows() {
502         try {
503                 
504                 vector<int> win; 
505                 it = trim.begin();
506                 
507                 int length = it->second - it->first;
508         
509                 //default is wanted = 10% of total length
510                 if (windowSizes > length) { 
511                         m->mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
512                         windowSizes = length / 10;
513                 }else if (windowSizes == 0) { windowSizes = length / 10;  }
514                 else if (windowSizes > (length * 0.20)) {
515                         m->mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); m->mothurOutEndLine();
516                 }else if (windowSizes < (length * 0.05)) {
517                         m->mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); m->mothurOutEndLine();
518                 }
519                 
520                 //save starting points of each window
521                 for (int m = it->first;  m < (it->second-windowSizes); m+=windowSizes) {  win.push_back(m);  }
522                 
523                 //save last window
524                 if (win[win.size()-1] < (it->first+length)) {
525                         win.push_back(win[win.size()-1]+windowSizes); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
526                 }                                                                                                                                                                                                       //with this you would get 1,25,50,75,100
527                 
528                 return win;
529         }
530         catch(exception& e) {
531                 m->errorOut(e, "Ccode", "findWindows");
532                 exit(1);
533         }
534 }
535 //***************************************************************************************************************
536 int Ccode::getDiff(string seqA, string seqB) {
537         try {
538                 
539                 int numDiff = 0;
540                 
541                 for (int i = 0; i < seqA.length(); i++) {
542                         //if you are both not gaps
543                         //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
544                                 //are you different
545                                 if (seqA[i] != seqB[i]) { 
546                                          int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
547
548                                         /* the char in base_a and base_b have been checked and they are different */
549                                         if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
550                                         else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
551                                         else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
552                                         else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
553                                         else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
554                                         else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
555                                         else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
556                                         else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
557                                         else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
558                                         else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
559                                         else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
560                                         else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
561                                         else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
562                                         else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
563                                         else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
564                                         else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
565                                         else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
566                                         else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
567                                         else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
568                                         else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
569                                         else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
570                                         else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
571                                         else ok = 0;  /* the bases are different and not equivalent */
572                                         
573                                         //check if they are both blanks
574                                         if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
575                                         else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
576                                         
577                                         if (ok == 0) {  numDiff++;  }
578                                 }
579                         //}
580                 }
581                 
582                 return numDiff;
583         
584         }
585         catch(exception& e) {
586                 m->errorOut(e, "Ccode", "getDiff");
587                 exit(1);
588         }
589 }
590 //***************************************************************************************************************
591 //tried to make this look most like ccode original implementation
592 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs) {
593         try {
594                 
595                 vector< vector<int> > numDiffBases;
596                 numDiffBases.resize(seqs.size());
597                 //initialize to 0
598                 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
599                 
600                 it = trim.begin();
601                 int length = it->second - it->first;
602                 
603                 //calc differences from each sequence to everyother seq in the set
604                 for (int i = 0; i < seqs.size(); i++) {
605                         
606                         string seqA = seqs[i].seq->getAligned().substr(it->first, length);
607                         
608                         //so you don't calc i to j and j to i since they are the same
609                         for (int j = 0; j < i; j++) {
610                                 
611                                 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
612                                 
613                                 //compare strings
614                                 int numDiff = getDiff(seqA, seqB);
615                                 
616                                 numDiffBases[i][j] = numDiff;
617                                 numDiffBases[j][i] = numDiff;
618                         }
619                 }
620                 
621                 //initailize remove to 0
622                 vector<int> remove;  remove.resize(seqs.size(), 0);
623                 float top = ((20*length) / (float) 100);
624                 float bottom = ((0.5*length) / (float) 100);
625                 
626                 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
627                 for (int i = 0; i < numDiffBases.size(); i++) {
628                         for (int j = 0; j < i; j++) {   
629                                 //are you more than 20% different
630                                 if (numDiffBases[i][j] > top)           {  remove[j] = 1;  }
631                                 //are you less than 0.5% different
632                                 if (numDiffBases[i][j] < bottom)        {  remove[j] = 1;  }
633                         }
634                 }
635                 
636                 int numSeqsLeft = 0;
637                 
638                 //count seqs that are not going to be removed
639                 for (int i = 0; i < remove.size(); i++) {  
640                         if (remove[i] == 0)  { numSeqsLeft++;  }
641                 }
642                 
643                 //if you have enough then remove bad ones
644                 if (numSeqsLeft >= 3) {
645                         vector<SeqDist> goodSeqs;
646                         //remove bad seqs
647                         for (int i = 0; i < remove.size(); i++) {
648                                 if (remove[i] == 0) { 
649                                         goodSeqs.push_back(seqs[i]);
650                                 }
651                         }
652                         
653                         seqs = goodSeqs;
654                         
655                 }else { //warn, but dont remove any
656                         m->mothurOut(querySeq->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); m->mothurOutEndLine();  
657                 }
658
659         }
660         catch(exception& e) {
661                 m->errorOut(e, "Ccode", "removeBadReferenceSeqs");
662                 exit(1);
663         }
664 }
665 //***************************************************************************************************************
666 //makes copy of templateseq for filter
667 vector<SeqDist>  Ccode::findClosest(Sequence* q, int numWanted) {
668         try{
669         
670                 vector<SeqDist>  topMatches;  
671                 
672                 Sequence query = *(q);
673                         
674                 //calc distance to each sequence in template seqs
675                 for (int i = 0; i < templateSeqs.size(); i++) {
676                         
677                         Sequence ref = *(templateSeqs[i]); 
678                                         
679                         //find overall dist
680                         distCalc->calcDist(query, ref);
681                         float dist = distCalc->getDist();       
682                                 
683                         //save distance
684                         SeqDist temp;
685                         temp.seq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned());
686                         temp.dist = dist;
687
688                         topMatches.push_back(temp);
689                 }
690                         
691                 sort(topMatches.begin(), topMatches.end(), compareSeqDist);
692                 
693                 for (int i = numWanted; i < topMatches.size(); i++) {  delete topMatches[i].seq;  }
694                 
695                 topMatches.resize(numWanted);
696                         
697                 return topMatches;
698
699         }
700         catch(exception& e) {
701                 m->errorOut(e, "Ccode", "findClosestSides");
702                 exit(1);
703         }
704 }
705 /**************************************************************************************************/
706 //find the distances from each reference sequence to every other reference sequence for each window for this query
707 void Ccode::getAverageRef(vector<SeqDist> ref) {
708         try {
709                 
710                 vector< vector< vector<int> > >  diffs;  //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
711                 
712                 //initialize diffs vector
713                 diffs.resize(ref.size());
714                 for (int i = 0; i < diffs.size(); i++) {  
715                         diffs[i].resize(ref.size());
716                         for (int j = 0; j < diffs[i].size(); j++) {
717                                 diffs[i][j].resize(windows.size(), 0);
718                         }
719                 }
720                 
721                 it = trim.begin();
722                                 
723                 //find the distances from each reference sequence to every other reference sequence for each window for this query              
724                 for (int i = 0; i < ref.size(); i++) {
725                         
726                         string refI = ref[i].seq->getAligned();
727                         
728                         //j<i, so you don't find distances from i to j and then j to i.
729                         for (int j = 0; j < i; j++) {
730                         
731                                 string refJ = ref[j].seq->getAligned();
732                         
733                                 for (int k = 0; k < windows.size(); k++) {
734                                         
735                                         string refIWindowk, refJWindowk;
736                                         
737                                         if (k < windows.size()-1) {
738                                                 //get window strings
739                                                 refIWindowk = refI.substr(windows[k], windowSizes);
740                                                 refJWindowk = refJ.substr(windows[k], windowSizes);
741                                         }else { //last window may be smaller than rest - see findwindows
742                                                 //get window strings
743                                                 refIWindowk = refI.substr(windows[k], (it->second-windows[k]));
744                                                 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
745                                         }
746                                         
747                                         //find differences
748                                         int diff = getDiff(refIWindowk, refJWindowk);
749
750                                         //save differences in [i][j][k] and [j][i][k] since they are the same
751                                         diffs[i][j][k] = diff;
752                                         diffs[j][i][k] = diff;
753
754                                 }//k
755                                 
756                         }//j
757                 
758                 }//i
759                 
760                 //initialize sumRef for this query 
761                 sumRef.resize(windows.size(), 0);
762                 sumSquaredRef.resize(windows.size(), 0);
763                 averageRef.resize(windows.size(), 0);
764                 
765                 //find the sum of the differences for hte reference sequences
766                 for (int i = 0; i < diffs.size(); i++) {
767                         for (int j = 0; j < i; j++) {
768                         
769                                 //increment this querys reference sequences combos
770                                 refCombo++;
771                                 
772                                 for (int k = 0; k < diffs[i][j].size(); k++) {
773                                         sumRef[k] += diffs[i][j][k];
774                                         sumSquaredRef[k] += (diffs[i][j][k]*diffs[i][j][k]);
775                                 }//k
776                                 
777                         }//j
778                 }//i
779
780                 
781                 //find the average of the differences for the references for each window
782                 for (int i = 0; i < windows.size(); i++) {
783                         averageRef[i] = sumRef[i] / (float) refCombo;
784                 }
785                 
786         }
787         catch(exception& e) {
788                 m->errorOut(e, "Ccode", "getAverageRef");
789                 exit(1);
790         }
791 }
792 /**************************************************************************************************/
793 void Ccode::getAverageQuery (vector<SeqDist> ref, Sequence* query) {
794         try {
795         
796                 vector< vector<int> >  diffs;  //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
797                 
798                 //initialize diffs vector
799                 diffs.resize(ref.size());
800                 for (int j = 0; j < diffs.size(); j++) {
801                         diffs[j].resize(windows.size(), 0);
802                 }
803                 
804                 it = trim.begin();
805                                                         
806                 string refQuery = query->getAligned();
807                          
808                 //j<i, so you don't find distances from i to j and then j to i.
809                 for (int j = 0; j < ref.size(); j++) {
810                          
811                          string refJ = ref[j].seq->getAligned();
812                          
813                          for (int k = 0; k < windows.size(); k++) {
814                                         
815                                         string QueryWindowk, refJWindowk;
816                                         
817                                         if (k < windows.size()-1) {
818                                                 //get window strings
819                                                 QueryWindowk = refQuery.substr(windows[k], windowSizes);
820                                                 refJWindowk = refJ.substr(windows[k], windowSizes);                                     
821                                         }else { //last window may be smaller than rest - see findwindows
822                                                 //get window strings
823                                                 QueryWindowk = refQuery.substr(windows[k], (it->second-windows[k]));
824                                                 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
825                                         }
826                                         
827                                         //find differences
828                                         int diff = getDiff(QueryWindowk, refJWindowk);
829                                         
830                                         //save differences 
831                                         diffs[j][k] = diff;
832                          
833                          }//k
834                 }//j
835                          
836                 
837                 //initialize sumRef for this query 
838                 sumQuery.resize(windows.size(), 0);
839                 sumSquaredQuery.resize(windows.size(), 0);
840                 averageQuery.resize(windows.size(), 0);
841                 
842                 //find the sum of the differences 
843                 for (int j = 0; j < diffs.size(); j++) {
844                         for (int k = 0; k < diffs[j].size(); k++) {
845                                 sumQuery[k] += diffs[j][k];
846                                 sumSquaredQuery[k] += (diffs[j][k]*diffs[j][k]);
847                         }//k
848                 }//j
849                 
850                 
851                 //find the average of the differences for the references for each window
852                 for (int i = 0; i < windows.size(); i++) {
853                         averageQuery[i] = sumQuery[i] / (float) ref.size();
854                 }
855         }
856         catch(exception& e) {
857                 m->errorOut(e, "Ccode", "getAverageQuery");
858                 exit(1);
859         }
860 }
861 /**************************************************************************************************/
862 void Ccode::findVarianceRef() {
863         try {
864                 
865                 varRef.resize(windows.size(), 0);
866                 sdRef.resize(windows.size(), 0);
867                 
868                 //for each window
869                 for (int i = 0; i < windows.size(); i++) {
870                         varRef[i] = (sumSquaredRef[i] - ((sumRef[i]*sumRef[i])/(float)refCombo)) / (float)(refCombo-1);
871                         sdRef[i] = sqrt(varRef[i]);
872                         
873                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
874                         if (averageRef[i] < 0.001)                      {       averageRef[i] = 0.001;          }
875                         if (sumRef[i] < 0.001)                          {       sumRef[i] = 0.001;                      }
876                         if (varRef[i] < 0.001)                          {       varRef[i] = 0.001;                      }
877                         if (sumSquaredRef[i] < 0.001)           {       sumSquaredRef[i] = 0.001;       }
878                         if (sdRef[i] < 0.001)                           {       sdRef[i] = 0.001;                       }
879                                 
880                 }
881         }
882         catch(exception& e) {
883                 m->errorOut(e, "Ccode", "findVarianceRef");
884                 exit(1);
885         }
886 }
887 /**************************************************************************************************/
888 void Ccode::findVarianceQuery() {
889         try {
890                 varQuery.resize(windows.size(), 0);
891                 sdQuery.resize(windows.size(), 0);
892                 
893                 //for each window
894                 for (int i = 0; i < windows.size(); i++) {
895                         varQuery[i] = (sumSquaredQuery[i] - ((sumQuery[i]*sumQuery[i])/(float) closest.size())) / (float) (closest.size()-1);
896                         sdQuery[i] = sqrt(varQuery[i]);
897                         
898                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
899                         if (averageQuery[i] < 0.001)                    {       averageQuery[i] = 0.001;                }
900                         if (sumQuery[i] < 0.001)                                {       sumQuery[i] = 0.001;                    }
901                         if (varQuery[i] < 0.001)                                {       varQuery[i] = 0.001;                    }
902                         if (sumSquaredQuery[i] < 0.001)         {       sumSquaredQuery[i] = 0.001;     }
903                         if (sdQuery[i] < 0.001)                         {       sdQuery[i] = 0.001;                     }
904                 }
905
906         }
907         catch(exception& e) {
908                 m->errorOut(e, "Ccode", "findVarianceQuery");
909                 exit(1);
910         }
911 }
912 /**************************************************************************************************/
913 void Ccode::determineChimeras() {
914         try {
915                 
916                 isChimericConfidence.resize(windows.size(), false);
917                 isChimericTStudent.resize(windows.size(), false);
918                 isChimericANOVA.resize(windows.size(), false);
919                 anova.resize(windows.size());
920
921                 
922                 //for each window
923                 for (int i = 0; i < windows.size(); i++) {
924                 
925                         //get confidence limits
926                         float t = getT(closest.size()-1);  //how many seqs you are comparing to this querySeq
927                         float dsUpper = (averageQuery[i] + (t * sdQuery[i])) / averageRef[i]; 
928                         float dsLower = (averageQuery[i] - (t * sdQuery[i])) / averageRef[i]; 
929                 
930                         if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[i] > averageRef[i])) {  /* range does not include 1 */
931                                         isChimericConfidence[i] = true;   /* significantly higher at P<0.05 */ 
932
933                         }
934                         
935                         //student t test
936                         int degreeOfFreedom = refCombo + closest.size() - 2;
937                         float denomForT = (((refCombo-1) * varQuery[i] + (closest.size() - 1) * varRef[i]) / (float) degreeOfFreedom) * ((refCombo + closest.size()) / (float) (refCombo * closest.size()));    /* denominator, without sqrt(), for ts calculations */
938                                 
939                         float ts = fabs((averageQuery[i] - averageRef[i]) / (sqrt(denomForT)));  /* value of ts for t-student test */   
940                         t = getT(degreeOfFreedom);
941         
942                         if ((ts >= t) && (averageQuery[i] > averageRef[i])) {   
943                                 isChimericTStudent[i] = true;   /* significantly higher at P<0.05 */ 
944                         }
945                 
946                         //anova test
947                         float value1 = sumQuery[i] + sumRef[i];
948                         float value2 = sumSquaredQuery[i] + sumSquaredRef[i];
949                         float value3 = ((sumQuery[i]*sumQuery[i]) / (float) (closest.size())) + ((sumRef[i] * sumRef[i]) / (float) refCombo);
950                         float value4 = (value1 * value1) / ( (float) (closest.size() + refCombo) );
951                         float value5 = value2 - value4;
952                         float value6 = value3 - value4;
953                         float value7 = value5 - value6;
954                         float value8 = value7 / ((float) degreeOfFreedom);
955                         float anovaValue = value6 / value8;
956                         
957                         float f = getF(degreeOfFreedom);
958                         
959                          if ((anovaValue >= f) && (averageQuery[i] > averageRef[i]))  {
960                        isChimericANOVA[i] = true;   /* significant P<0.05 */
961                 }
962                         
963                         if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; }
964                         
965                         anova[i] = anovaValue;
966                 }
967                 
968         }
969         catch(exception& e) {
970                 m->errorOut(e, "Ccode", "determineChimeras");
971                 exit(1);
972         }
973 }
974 /**************************************************************************************************/    
975 float Ccode::getT(int numseq) {
976         try {
977         
978                 float tvalue = 0;
979                 
980                 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
981                 if (numseq > 120) tvalue = 1.645;
982                 else if (numseq > 60) tvalue = 1.658;
983         else if (numseq > 40) tvalue = 1.671;
984         else if (numseq > 30) tvalue = 1.684;
985         else if (numseq > 29) tvalue = 1.697;
986         else if (numseq > 28) tvalue = 1.699;
987         else if (numseq > 27) tvalue = 1.701;
988         else if (numseq > 26) tvalue = 1.703;
989         else if (numseq > 25) tvalue = 1.706;
990         else if (numseq > 24) tvalue = 1.708;
991         else if (numseq > 23) tvalue = 1.711;
992         else if (numseq > 22) tvalue = 1.714;
993         else if (numseq > 21) tvalue = 1.717;
994         else if (numseq > 20) tvalue = 1.721;
995         else if (numseq > 19) tvalue = 1.725;
996         else if (numseq > 18) tvalue = 1.729;
997         else if (numseq > 17) tvalue = 1.734;
998         else if (numseq > 16) tvalue = 1.740;
999         else if (numseq > 15) tvalue = 1.746;
1000         else if (numseq > 14) tvalue = 1.753;
1001         else if (numseq > 13) tvalue = 1.761;
1002         else if (numseq > 12) tvalue = 1.771;
1003         else if (numseq > 11) tvalue = 1.782;
1004         else if (numseq > 10) tvalue = 1.796;
1005         else if (numseq > 9) tvalue = 1.812;
1006         else if (numseq > 8) tvalue = 1.833;
1007         else if (numseq > 7) tvalue = 1.860;
1008         else if (numseq > 6) tvalue = 1.895;
1009         else if (numseq > 5) tvalue = 1.943;
1010         else if (numseq > 4) tvalue = 2.015;
1011         else if (numseq > 3) tvalue = 2.132;
1012         else if (numseq > 2) tvalue = 2.353;
1013         else if (numseq > 1) tvalue = 2.920;
1014                 else if (numseq <= 1) {
1015                         m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); m->mothurOutEndLine();
1016                 }
1017                 
1018                 return tvalue;
1019         }
1020         catch(exception& e) {
1021                 m->errorOut(e, "Ccode", "getT");
1022                 exit(1);
1023         }
1024 }
1025 /**************************************************************************************************/    
1026 float Ccode::getF(int numseq) {
1027         try {
1028         
1029                 float fvalue = 0;
1030                 
1031                  /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
1032         if (numseq > 120) fvalue = 3.84;
1033         else if (numseq > 60) fvalue = 3.92;
1034         else if (numseq > 40) fvalue = 4.00;
1035         else if (numseq > 30) fvalue = 4.08;
1036         else if (numseq > 29) fvalue = 4.17;
1037         else if (numseq > 28) fvalue = 4.18;
1038         else if (numseq > 27) fvalue = 4.20;
1039         else if (numseq > 26) fvalue = 4.21;
1040         else if (numseq > 25) fvalue = 4.23;
1041         else if (numseq > 24) fvalue = 4.24;
1042         else if (numseq > 23) fvalue = 4.26;
1043         else if (numseq > 22) fvalue = 4.28;
1044         else if (numseq > 21) fvalue = 4.30;
1045         else if (numseq > 20) fvalue = 4.32;
1046         else if (numseq > 19) fvalue = 4.35;
1047         else if (numseq > 18) fvalue = 4.38;
1048         else if (numseq > 17) fvalue = 4.41;
1049         else if (numseq > 16) fvalue = 4.45;
1050         else if (numseq > 15) fvalue = 4.49;
1051         else if (numseq > 14) fvalue = 4.54;
1052         else if (numseq > 13) fvalue = 4.60;
1053         else if (numseq > 12) fvalue = 4.67;
1054         else if (numseq > 11) fvalue = 4.75;
1055         else if (numseq > 10) fvalue = 4.84;
1056         else if (numseq > 9) fvalue = 4.96;
1057         else if (numseq > 8) fvalue = 5.12;
1058         else if (numseq > 7) fvalue = 5.32;
1059         else if (numseq > 6) fvalue = 5.59;
1060         else if (numseq > 5) fvalue = 5.99;
1061         else if (numseq > 4) fvalue = 6.61;
1062         else if (numseq > 3) fvalue = 7.71;
1063         else if (numseq > 2) fvalue = 10.1;
1064         else if (numseq > 1) fvalue = 18.5;
1065         else if (numseq > 0) fvalue = 161;
1066                 else if (numseq <= 0) {
1067                         m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); m->mothurOutEndLine();
1068         }
1069                 
1070                 return fvalue;
1071         }
1072         catch(exception& e) {
1073                 m->errorOut(e, "Ccode", "getF");
1074                 exit(1);
1075         }
1076 }
1077 //***************************************************************************************************************
1078
1079
1080