]> git.donarmstrong.com Git - mothur.git/blob - ccode.cpp
added MPI code, broke up chimera.seqs into 5 separated commands, added parse.sff...
[mothur.git] / ccode.cpp
1 /*
2  *  ccode.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 8/24/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "ccode.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14
15 //***************************************************************************************************************
16 Ccode::Ccode(string filename, string temp, bool f, string mask, int win, int numW, string o) : Chimera() {  
17         fastafile = filename;  
18         outputDir = o; 
19         templateFileName = temp;  templateSeqs = readSeqs(temp);
20         setMask(mask);
21         filter = f;
22         window = win;
23         numWanted = numW;
24         
25         distCalc = new eachGapDist();
26         decalc = new DeCalculator();
27         
28         mapInfo = outputDir + getRootName(getSimpleName(fastafile)) + "mapinfo";
29         
30         #ifdef USE_MPI
31                 
32                 char inFileName[mapInfo.length()];
33                 strcpy(inFileName, mapInfo.c_str());
34                 
35                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
36
37                 MPI_File_open(MPI_COMM_WORLD, inFileName, outMode, MPI_INFO_NULL, &outMap);  //comm, filename, mode, info, filepointer
38                 
39                 int pid;
40                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
41                 
42                 if (pid == 0) {
43                         string outString = "Place in masked, filtered and trimmed sequence\tPlace in original alignment\n";
44                         
45                         MPI_Status status;
46                         int length = outString.length();
47                         char buf2[length];
48                         strcpy(buf2, outString.c_str()); 
49                                 
50                         MPI_File_write_shared(outMap, buf2, length, MPI_CHAR, &status);
51                 }
52         #else
53
54                 ofstream out2;
55                 openOutputFile(mapInfo, out2);
56                 
57                 out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
58                 out2.close();
59         #endif
60 }
61 //***************************************************************************************************************
62 Ccode::~Ccode() {
63         delete distCalc;
64         delete decalc;
65         
66         #ifdef USE_MPI
67                 MPI_File_close(&outMap);
68         #endif
69 }       
70 //***************************************************************************************************************
71 int Ccode::print(ostream& out, ostream& outAcc) {
72         try {
73                 
74                 ofstream out2;
75                 openOutputFileAppend(mapInfo, out2);
76                 
77                 out2 << querySeq->getName() << endl;
78                 for (it = spotMap.begin(); it!= spotMap.end(); it++) {
79                         out2 << it->first << '\t' << it->second << endl;
80                 }
81                 out2.close();
82                 out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
83                         
84                 for (int j = 0; j < closest.size(); j++) {
85                         out << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
86                 }
87                 out << endl << endl;
88                 
89                 //for each window
90                 //window mapping info.
91                 out << "Mapping information: ";
92                 //you mask and did not filter
93                 if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
94                                 
95                 //you filtered and did not mask
96                 if ((seqMask == "") && (filter)) { out << "filter and trim."; }
97                                 
98                 //you masked and filtered
99                 if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
100                         
101                 out << endl << "Window\tStartPos\tEndPos" << endl;
102                 it = trim.begin();
103                 for (int k = 0; k < windows.size()-1; k++) {
104                         out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl;
105                 }
106                         
107                 out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl;
108                 out << endl;
109                 out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
110                 for (int k = 0; k < windows.size(); k++) {
111                         float ds = averageQuery[k] / averageRef[k]; 
112                         out << k+1 << '\t' << averageQuery[k] << '\t' << sdQuery[k] << '\t' << averageRef[k] << '\t'<< sdRef[k] << '\t' << ds << '\t' << anova[k] << endl;
113                 }
114                 out << endl;
115                         
116                 //varRef
117                 //varQuery
118                 /* F test for differences among variances.
119                 * varQuery is expected to be higher or similar than varRef */
120                 //float fs = varQuery[query] / varRef[query];   /* F-Snedecor, test for differences of variances */
121                         
122                 bool results = false;   
123                                         
124                 //confidence limit, t - Student, anova
125                 out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
126                         
127                 for (int k = 0; k < windows.size(); k++) {
128                         string temp = "";
129                         if (isChimericConfidence[k]) {  temp += "*\t"; }
130                         else { temp += "\t"; }
131                                 
132                         if (isChimericTStudent[k]) {  temp += "*\t"; }
133                         else { temp += "\t"; }
134                                 
135                         if (isChimericANOVA[k]) {  temp += "*\t"; }
136                         else { temp += "\t"; }
137                         
138                         out << k+1 << '\t' << temp << endl;
139                                 
140                         if (temp == "*\t*\t*\t") {  results = true;  }
141                 }
142                 out << endl;    
143                         
144                 if (results) {
145                         m->mothurOut(querySeq->getName() + " was found have at least one chimeric window."); m->mothurOutEndLine();
146                         outAcc << querySeq->getName() << endl;
147                 }
148
149                 //free memory
150                 for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
151
152                 return results;
153         }
154         catch(exception& e) {
155                 m->errorOut(e, "Ccode", "print");
156                 exit(1);
157         }
158 }
159 #ifdef USE_MPI
160 //***************************************************************************************************************
161 int Ccode::print(MPI_File& out, MPI_File& outAcc) {
162         try {
163                 
164                 string outMapString = "";
165                 
166                 outMapString += querySeq->getName() + "\n";
167                 for (it = spotMap.begin(); it!= spotMap.end(); it++) {
168                         outMapString += toString(it->first)  + "\t" + toString(it->second)  + "\n";
169                 }
170                 printMapping(outMapString);
171                 outMapString = "";
172                 
173                 string outString = "";
174                 string outAccString = "";
175                 
176                 outString +=  querySeq->getName() + "\n\nReference sequences used and distance to query:\n";
177                         
178                 for (int j = 0; j < closest.size(); j++) {
179                         outString += closest[j].seq->getName() + "\t" + toString(closest[j].dist) + "\n";
180                 }
181                 outString += "\n\nMapping information: ";
182                 
183                 //for each window
184                 //window mapping info.
185                 //you mask and did not filter
186                 if ((seqMask != "") && (!filter)) { outString += "mask and trim."; }
187                                 
188                 //you filtered and did not mask
189                 if ((seqMask == "") && (filter)) { outString += "filter and trim."; }
190                                 
191                 //you masked and filtered
192                 if ((seqMask != "") && (filter)) { outString += "mask, filter and trim."; }
193                         
194                 outString += "\nWindow\tStartPos\tEndPos\n";
195                 it = trim.begin();
196                 for (int k = 0; k < windows.size()-1; k++) {
197                         outString += toString(k+1) + "\t" + toString(spotMap[windows[k]-it->first]) + "\t" + toString(spotMap[windows[k]-it->first+windowSizes]) + "\n";
198                 }
199                         
200                 outString += toString(windows.size()) + "\t" + toString(spotMap[windows[windows.size()-1]-it->first]) + "\t" + toString(spotMap[it->second-it->first-1]) + "\n\n";
201                 
202                 outString += "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova\n";
203                 for (int k = 0; k < windows.size(); k++) {
204                         float ds = averageQuery[k] / averageRef[k]; 
205                         outString += toString(k+1) + "\t" + toString(averageQuery[k]) + "\t" + toString(sdQuery[k]) + "\t" + toString(averageRef[k]) + "\t" + toString(sdRef[k]) + "\t" + toString(ds) + "\t" + toString(anova[k]) + "\n";
206                 }
207                         
208                 //varRef
209                 //varQuery
210                 /* F test for differences among variances.
211                 * varQuery is expected to be higher or similar than varRef */
212                 //float fs = varQuery[query] / varRef[query];   /* F-Snedecor, test for differences of variances */
213                         
214                 bool results = false;   
215                                         
216                 //confidence limit, t - Student, anova
217                 outString += "\nWindow\tConfidenceLimit\tt-Student\tAnova\n";
218                         
219                 for (int k = 0; k < windows.size(); k++) {
220                         string temp = "";
221                         if (isChimericConfidence[k]) {  temp += "*\t"; }
222                         else { temp += "\t"; }
223                                 
224                         if (isChimericTStudent[k]) {  temp += "*\t"; }
225                         else { temp += "\t"; }
226                                 
227                         if (isChimericANOVA[k]) {  temp += "*\t"; }
228                         else { temp += "\t"; }
229                         
230                         outString += toString(k+1) + "\t" + temp + "\n";
231                                 
232                         if (temp == "*\t*\t*\t") {  results = true;  }
233                 }
234                 outString += "\n";      
235                 
236                 MPI_Status status;
237                 int length = outString.length();
238                 char buf2[length];
239                 strcpy(buf2, outString.c_str()); 
240                                 
241                 MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status);
242                         
243                 if (results) {
244                         m->mothurOut(querySeq->getName() + " was found have at least one chimeric window."); m->mothurOutEndLine();
245                         outAccString += querySeq->getName() + "\n";
246                         
247                         MPI_Status statusAcc;
248                         length = outAccString.length();
249                         char buf[length];
250                         strcpy(buf, outAccString.c_str()); 
251                                 
252                         MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc);
253                 }
254
255                 //free memory
256                 for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
257
258                 return results;
259         }
260         catch(exception& e) {
261                 m->errorOut(e, "Ccode", "print");
262                 exit(1);
263         }
264 }
265 //***************************************************************************************************************
266 int Ccode::printMapping(string& output) {
267         try {
268                         MPI_Status status;
269                         int length = output.length();
270                         char buf[length];
271                         strcpy(buf, output.c_str()); 
272                                 
273                         MPI_File_write_shared(outMap, buf, length, MPI_CHAR, &status);
274
275         }
276         catch(exception& e) {
277                 m->errorOut(e, "Ccode", "printMapping");
278                 exit(1);
279         }
280 }
281 #endif
282 //***************************************************************************************************************
283 int Ccode::getChimeras(Sequence* query) {
284         try {
285         
286                 closest.clear();
287                 refCombo = 0;
288                 sumRef.clear(); 
289                 varRef.clear(); 
290                 varQuery.clear(); 
291                 sdRef.clear(); 
292                 sdQuery.clear();     
293                 sumQuery.clear();
294                 sumSquaredRef.clear(); 
295                 sumSquaredQuery.clear(); 
296                 averageRef.clear();
297                 averageQuery.clear();
298                 anova.clear();
299                 isChimericConfidence.clear();
300                 isChimericTStudent.clear();
301                 isChimericANOVA.clear();
302                 trim.clear();
303                 spotMap.clear();
304                 windowSizes = window;
305                 windows.clear();
306
307         
308                 querySeq = query;
309                 
310                 //find closest matches to query
311                 closest = findClosest(query, numWanted);
312                 
313                 if (m->control_pressed) {  return 0;  }
314                 
315                 //initialize spotMap
316                 for (int i = 0; i < query->getAligned().length(); i++) {        spotMap[i] = i;         }
317         
318                 //mask sequences if the user wants to 
319                 if (seqMask != "") {
320                         decalc->setMask(seqMask);
321                         
322                         decalc->runMask(query);
323                         
324                         //mask closest
325                         for (int i = 0; i < closest.size(); i++) {      decalc->runMask(closest[i].seq);        }
326                         
327                         spotMap = decalc->getMaskMap();
328                 }
329                 
330                 if (filter) {
331                         vector<Sequence*> temp;
332                         for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq);  }
333                         temp.push_back(query);  
334                         
335                         createFilter(temp, 0.5);
336                 
337                         for (int i = 0; i < temp.size(); i++) { 
338                                 if (m->control_pressed) {  return 0;  }
339                                 runFilter(temp[i]);  
340                         }
341                         
342                         //update spotMap
343                         map<int, int> newMap;
344                         int spot = 0;
345                         
346                         for (int i = 0; i < filterString.length(); i++) {
347                                 if (filterString[i] == '1') {
348                                         //add to newMap
349                                         newMap[spot] = spotMap[i];
350                                         spot++;  
351                                 }
352                         }
353                         spotMap = newMap;
354                 }
355
356                 //trim sequences - this follows ccodes remove_extra_gaps 
357                 trimSequences(query);
358                 if (m->control_pressed) {  return 0;  }
359                 
360                 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().  
361                 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
362                 windows = findWindows();  
363                 if (m->control_pressed) {  return 0;  }
364
365                 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later 
366                 removeBadReferenceSeqs(closest);
367                 if (m->control_pressed) {  return 0;  }
368                 
369                 //find the averages for each querys references
370                 getAverageRef(closest);  //fills sumRef, averageRef, sumSquaredRef and refCombo.
371                 getAverageQuery(closest, query);  //fills sumQuery, averageQuery, sumSquaredQuery.
372                 if (m->control_pressed) {  return 0;  }                 
373                 
374                 //find the averages for each querys references 
375                 findVarianceRef();  //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0.
376                 if (m->control_pressed) {  return 0;  } 
377                         
378                 //find the averages for the query 
379                 findVarianceQuery();  //fills varQuery and sdQuery also sets minimum error rate to 0.001 to avoid divide by 0.
380                 if (m->control_pressed) {  return 0;  }
381                                         
382                 determineChimeras();  //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA. 
383                 if (m->control_pressed) {  return 0;  }
384                 
385                 return 0;
386         }
387         catch(exception& e) {
388                 m->errorOut(e, "Ccode", "getChimeras");
389                 exit(1);
390         }
391 }
392 /***************************************************************************************************************/
393 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
394 void Ccode::trimSequences(Sequence* query) {
395         try {
396                 
397                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
398                 int rearPos = query->getAligned().length();
399                 
400                 //********find first position in closest seqs that is a non gap character***********//
401                 //find first position all query seqs that is a non gap character
402                 for (int i = 0; i < closest.size(); i++) {
403                         
404                         string aligned = closest[i].seq->getAligned();
405                         int pos = 0;
406                         
407                         //find first spot in this seq
408                         for (int j = 0; j < aligned.length(); j++) {
409                                 if (isalpha(aligned[j])) {
410                                         pos = j;
411                                         break;
412                                 }
413                         }
414                         
415                         //save this spot if it is the farthest
416                         if (pos > frontPos) { frontPos = pos; }
417                 }
418                 
419                 //find first position all querySeq[query] that is a non gap character
420                 string aligned = query->getAligned();
421                 int pos = 0;
422                         
423                 //find first spot in this seq
424                 for (int j = 0; j < aligned.length(); j++) {
425                         if (isalpha(aligned[j])) {
426                                 pos = j;
427                                 break;
428                         }
429                 }
430                 
431                 //save this spot if it is the farthest
432                 if (pos > frontPos) { frontPos = pos; }
433                 
434                 
435                 //********find last position in closest seqs that is a non gap character***********//
436                 for (int i = 0; i < closest.size(); i++) {
437                         
438                         string aligned = closest[i].seq->getAligned();
439                         int pos = aligned.length();
440                         
441                         //find first spot in this seq
442                         for (int j = aligned.length()-1; j >= 0; j--) {
443                                 if (isalpha(aligned[j])) {
444                                         pos = j;
445                                         break;
446                                 }
447                         }
448                         
449                         //save this spot if it is the farthest
450                         if (pos < rearPos) { rearPos = pos; }
451                 }
452                 
453                 //find last position all querySeqs[query] that is a non gap character
454                 aligned = query->getAligned();
455                 pos = aligned.length();
456                 
457                 //find first spot in this seq
458                 for (int j = aligned.length()-1; j >= 0; j--) {
459                         if (isalpha(aligned[j])) {
460                                 pos = j;
461                                 break;
462                         }
463                 }
464                 
465                 //save this spot if it is the farthest
466                 if (pos < rearPos) { rearPos = pos; }
467
468                 
469                 //check to make sure that is not whole seq
470                 if ((rearPos - frontPos - 1) <= 0) {  m->mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); m->mothurOutEndLine(); exit(1);  }
471                 
472                 map<int, int> tempTrim;
473                 tempTrim[frontPos] = rearPos;
474                 
475                 //save trimmed locations
476                 trim = tempTrim;
477                                                 
478                 //update spotMask
479                 map<int, int> newMap;
480                 int spot = 0;
481                 
482                 for (int i = frontPos; i < rearPos; i++) {
483                         //add to newMap
484                         newMap[spot] = spotMap[i];
485                         spot++;  
486                 }
487                 spotMap = newMap;
488         }
489         catch(exception& e) {
490                 m->errorOut(e, "Ccode", "trimSequences");
491                 exit(1);
492         }
493 }
494 /***************************************************************************************************************/
495 vector<int> Ccode::findWindows() {
496         try {
497                 
498                 vector<int> win; 
499                 it = trim.begin();
500                 
501                 int length = it->second - it->first;
502         
503                 //default is wanted = 10% of total length
504                 if (windowSizes > length) { 
505                         m->mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
506                         windowSizes = length / 10;
507                 }else if (windowSizes == 0) { windowSizes = length / 10;  }
508                 else if (windowSizes > (length * 0.20)) {
509                         m->mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); m->mothurOutEndLine();
510                 }else if (windowSizes < (length * 0.05)) {
511                         m->mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); m->mothurOutEndLine();
512                 }
513                 
514                 //save starting points of each window
515                 for (int m = it->first;  m < (it->second-windowSizes); m+=windowSizes) {  win.push_back(m);  }
516                 
517                 //save last window
518                 if (win[win.size()-1] < (it->first+length)) {
519                         win.push_back(win[win.size()-1]+windowSizes); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
520                 }                                                                                                                                                                                                       //with this you would get 1,25,50,75,100
521                 
522                 return win;
523         }
524         catch(exception& e) {
525                 m->errorOut(e, "Ccode", "findWindows");
526                 exit(1);
527         }
528 }
529 //***************************************************************************************************************
530 int Ccode::getDiff(string seqA, string seqB) {
531         try {
532                 
533                 int numDiff = 0;
534                 
535                 for (int i = 0; i < seqA.length(); i++) {
536                         //if you are both not gaps
537                         //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
538                                 //are you different
539                                 if (seqA[i] != seqB[i]) { 
540                                          int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
541
542                                         /* the char in base_a and base_b have been checked and they are different */
543                                         if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
544                                         else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
545                                         else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
546                                         else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
547                                         else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
548                                         else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
549                                         else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
550                                         else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
551                                         else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
552                                         else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
553                                         else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
554                                         else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
555                                         else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
556                                         else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
557                                         else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
558                                         else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
559                                         else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
560                                         else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
561                                         else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
562                                         else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
563                                         else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
564                                         else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
565                                         else ok = 0;  /* the bases are different and not equivalent */
566                                         
567                                         //check if they are both blanks
568                                         if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
569                                         else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
570                                         
571                                         if (ok == 0) {  numDiff++;  }
572                                 }
573                         //}
574                 }
575                 
576                 return numDiff;
577         
578         }
579         catch(exception& e) {
580                 m->errorOut(e, "Ccode", "getDiff");
581                 exit(1);
582         }
583 }
584 //***************************************************************************************************************
585 //tried to make this look most like ccode original implementation
586 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs) {
587         try {
588                 
589                 vector< vector<int> > numDiffBases;
590                 numDiffBases.resize(seqs.size());
591                 //initialize to 0
592                 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
593                 
594                 it = trim.begin();
595                 int length = it->second - it->first;
596                 
597                 //calc differences from each sequence to everyother seq in the set
598                 for (int i = 0; i < seqs.size(); i++) {
599                         
600                         string seqA = seqs[i].seq->getAligned().substr(it->first, length);
601                         
602                         //so you don't calc i to j and j to i since they are the same
603                         for (int j = 0; j < i; j++) {
604                                 
605                                 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
606                                 
607                                 //compare strings
608                                 int numDiff = getDiff(seqA, seqB);
609                                 
610                                 numDiffBases[i][j] = numDiff;
611                                 numDiffBases[j][i] = numDiff;
612                         }
613                 }
614                 
615                 //initailize remove to 0
616                 vector<int> remove;  remove.resize(seqs.size(), 0);
617                 float top = ((20*length) / (float) 100);
618                 float bottom = ((0.5*length) / (float) 100);
619                 
620                 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
621                 for (int i = 0; i < numDiffBases.size(); i++) {
622                         for (int j = 0; j < i; j++) {   
623                                 //are you more than 20% different
624                                 if (numDiffBases[i][j] > top)           {  remove[j] = 1;  }
625                                 //are you less than 0.5% different
626                                 if (numDiffBases[i][j] < bottom)        {  remove[j] = 1;  }
627                         }
628                 }
629                 
630                 int numSeqsLeft = 0;
631                 
632                 //count seqs that are not going to be removed
633                 for (int i = 0; i < remove.size(); i++) {  
634                         if (remove[i] == 0)  { numSeqsLeft++;  }
635                 }
636                 
637                 //if you have enough then remove bad ones
638                 if (numSeqsLeft >= 3) {
639                         vector<SeqDist> goodSeqs;
640                         //remove bad seqs
641                         for (int i = 0; i < remove.size(); i++) {
642                                 if (remove[i] == 0) { 
643                                         goodSeqs.push_back(seqs[i]);
644                                 }
645                         }
646                         
647                         seqs = goodSeqs;
648                         
649                 }else { //warn, but dont remove any
650                         m->mothurOut(querySeq->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); m->mothurOutEndLine();  
651                 }
652
653         }
654         catch(exception& e) {
655                 m->errorOut(e, "Ccode", "removeBadReferenceSeqs");
656                 exit(1);
657         }
658 }
659 //***************************************************************************************************************
660 //makes copy of templateseq for filter
661 vector<SeqDist>  Ccode::findClosest(Sequence* q, int numWanted) {
662         try{
663         
664                 vector<SeqDist>  topMatches;  
665                 
666                 Sequence query = *(q);
667                         
668                 //calc distance to each sequence in template seqs
669                 for (int i = 0; i < templateSeqs.size(); i++) {
670                         
671                         Sequence ref = *(templateSeqs[i]); 
672                                         
673                         //find overall dist
674                         distCalc->calcDist(query, ref);
675                         float dist = distCalc->getDist();       
676                                 
677                         //save distance
678                         SeqDist temp;
679                         temp.seq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned());
680                         temp.dist = dist;
681
682                         topMatches.push_back(temp);
683                 }
684                         
685                 sort(topMatches.begin(), topMatches.end(), compareSeqDist);
686                 
687                 for (int i = numWanted; i < topMatches.size(); i++) {  delete topMatches[i].seq;  }
688                 
689                 topMatches.resize(numWanted);
690                         
691                 return topMatches;
692
693         }
694         catch(exception& e) {
695                 m->errorOut(e, "Ccode", "findClosestSides");
696                 exit(1);
697         }
698 }
699 /**************************************************************************************************/
700 //find the distances from each reference sequence to every other reference sequence for each window for this query
701 void Ccode::getAverageRef(vector<SeqDist> ref) {
702         try {
703                 
704                 vector< vector< vector<int> > >  diffs;  //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
705                 
706                 //initialize diffs vector
707                 diffs.resize(ref.size());
708                 for (int i = 0; i < diffs.size(); i++) {  
709                         diffs[i].resize(ref.size());
710                         for (int j = 0; j < diffs[i].size(); j++) {
711                                 diffs[i][j].resize(windows.size(), 0);
712                         }
713                 }
714                 
715                 it = trim.begin();
716                                 
717                 //find the distances from each reference sequence to every other reference sequence for each window for this query              
718                 for (int i = 0; i < ref.size(); i++) {
719                         
720                         string refI = ref[i].seq->getAligned();
721                         
722                         //j<i, so you don't find distances from i to j and then j to i.
723                         for (int j = 0; j < i; j++) {
724                         
725                                 string refJ = ref[j].seq->getAligned();
726                         
727                                 for (int k = 0; k < windows.size(); k++) {
728                                         
729                                         string refIWindowk, refJWindowk;
730                                         
731                                         if (k < windows.size()-1) {
732                                                 //get window strings
733                                                 refIWindowk = refI.substr(windows[k], windowSizes);
734                                                 refJWindowk = refJ.substr(windows[k], windowSizes);
735                                         }else { //last window may be smaller than rest - see findwindows
736                                                 //get window strings
737                                                 refIWindowk = refI.substr(windows[k], (it->second-windows[k]));
738                                                 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
739                                         }
740                                         
741                                         //find differences
742                                         int diff = getDiff(refIWindowk, refJWindowk);
743
744                                         //save differences in [i][j][k] and [j][i][k] since they are the same
745                                         diffs[i][j][k] = diff;
746                                         diffs[j][i][k] = diff;
747
748                                 }//k
749                                 
750                         }//j
751                 
752                 }//i
753                 
754                 //initialize sumRef for this query 
755                 sumRef.resize(windows.size(), 0);
756                 sumSquaredRef.resize(windows.size(), 0);
757                 averageRef.resize(windows.size(), 0);
758                 
759                 //find the sum of the differences for hte reference sequences
760                 for (int i = 0; i < diffs.size(); i++) {
761                         for (int j = 0; j < i; j++) {
762                         
763                                 //increment this querys reference sequences combos
764                                 refCombo++;
765                                 
766                                 for (int k = 0; k < diffs[i][j].size(); k++) {
767                                         sumRef[k] += diffs[i][j][k];
768                                         sumSquaredRef[k] += (diffs[i][j][k]*diffs[i][j][k]);
769                                 }//k
770                                 
771                         }//j
772                 }//i
773
774                 
775                 //find the average of the differences for the references for each window
776                 for (int i = 0; i < windows.size(); i++) {
777                         averageRef[i] = sumRef[i] / (float) refCombo;
778                 }
779                 
780         }
781         catch(exception& e) {
782                 m->errorOut(e, "Ccode", "getAverageRef");
783                 exit(1);
784         }
785 }
786 /**************************************************************************************************/
787 void Ccode::getAverageQuery (vector<SeqDist> ref, Sequence* query) {
788         try {
789         
790                 vector< vector<int> >  diffs;  //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
791                 
792                 //initialize diffs vector
793                 diffs.resize(ref.size());
794                 for (int j = 0; j < diffs.size(); j++) {
795                         diffs[j].resize(windows.size(), 0);
796                 }
797                 
798                 it = trim.begin();
799                                                         
800                 string refQuery = query->getAligned();
801                          
802                 //j<i, so you don't find distances from i to j and then j to i.
803                 for (int j = 0; j < ref.size(); j++) {
804                          
805                          string refJ = ref[j].seq->getAligned();
806                          
807                          for (int k = 0; k < windows.size(); k++) {
808                                         
809                                         string QueryWindowk, refJWindowk;
810                                         
811                                         if (k < windows.size()-1) {
812                                                 //get window strings
813                                                 QueryWindowk = refQuery.substr(windows[k], windowSizes);
814                                                 refJWindowk = refJ.substr(windows[k], windowSizes);                                     
815                                         }else { //last window may be smaller than rest - see findwindows
816                                                 //get window strings
817                                                 QueryWindowk = refQuery.substr(windows[k], (it->second-windows[k]));
818                                                 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
819                                         }
820                                         
821                                         //find differences
822                                         int diff = getDiff(QueryWindowk, refJWindowk);
823                                         
824                                         //save differences 
825                                         diffs[j][k] = diff;
826                          
827                          }//k
828                 }//j
829                          
830                 
831                 //initialize sumRef for this query 
832                 sumQuery.resize(windows.size(), 0);
833                 sumSquaredQuery.resize(windows.size(), 0);
834                 averageQuery.resize(windows.size(), 0);
835                 
836                 //find the sum of the differences 
837                 for (int j = 0; j < diffs.size(); j++) {
838                         for (int k = 0; k < diffs[j].size(); k++) {
839                                 sumQuery[k] += diffs[j][k];
840                                 sumSquaredQuery[k] += (diffs[j][k]*diffs[j][k]);
841                         }//k
842                 }//j
843                 
844                 
845                 //find the average of the differences for the references for each window
846                 for (int i = 0; i < windows.size(); i++) {
847                         averageQuery[i] = sumQuery[i] / (float) ref.size();
848                 }
849         }
850         catch(exception& e) {
851                 m->errorOut(e, "Ccode", "getAverageQuery");
852                 exit(1);
853         }
854 }
855 /**************************************************************************************************/
856 void Ccode::findVarianceRef() {
857         try {
858                 
859                 varRef.resize(windows.size(), 0);
860                 sdRef.resize(windows.size(), 0);
861                 
862                 //for each window
863                 for (int i = 0; i < windows.size(); i++) {
864                         varRef[i] = (sumSquaredRef[i] - ((sumRef[i]*sumRef[i])/(float)refCombo)) / (float)(refCombo-1);
865                         sdRef[i] = sqrt(varRef[i]);
866                         
867                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
868                         if (averageRef[i] < 0.001)                      {       averageRef[i] = 0.001;          }
869                         if (sumRef[i] < 0.001)                          {       sumRef[i] = 0.001;                      }
870                         if (varRef[i] < 0.001)                          {       varRef[i] = 0.001;                      }
871                         if (sumSquaredRef[i] < 0.001)           {       sumSquaredRef[i] = 0.001;       }
872                         if (sdRef[i] < 0.001)                           {       sdRef[i] = 0.001;                       }
873                                 
874                 }
875         }
876         catch(exception& e) {
877                 m->errorOut(e, "Ccode", "findVarianceRef");
878                 exit(1);
879         }
880 }
881 /**************************************************************************************************/
882 void Ccode::findVarianceQuery() {
883         try {
884                 varQuery.resize(windows.size(), 0);
885                 sdQuery.resize(windows.size(), 0);
886                 
887                 //for each window
888                 for (int i = 0; i < windows.size(); i++) {
889                         varQuery[i] = (sumSquaredQuery[i] - ((sumQuery[i]*sumQuery[i])/(float) closest.size())) / (float) (closest.size()-1);
890                         sdQuery[i] = sqrt(varQuery[i]);
891                         
892                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
893                         if (averageQuery[i] < 0.001)                    {       averageQuery[i] = 0.001;                }
894                         if (sumQuery[i] < 0.001)                                {       sumQuery[i] = 0.001;                    }
895                         if (varQuery[i] < 0.001)                                {       varQuery[i] = 0.001;                    }
896                         if (sumSquaredQuery[i] < 0.001)         {       sumSquaredQuery[i] = 0.001;     }
897                         if (sdQuery[i] < 0.001)                         {       sdQuery[i] = 0.001;                     }
898                 }
899
900         }
901         catch(exception& e) {
902                 m->errorOut(e, "Ccode", "findVarianceQuery");
903                 exit(1);
904         }
905 }
906 /**************************************************************************************************/
907 void Ccode::determineChimeras() {
908         try {
909                 
910                 isChimericConfidence.resize(windows.size(), false);
911                 isChimericTStudent.resize(windows.size(), false);
912                 isChimericANOVA.resize(windows.size(), false);
913                 anova.resize(windows.size());
914
915                 
916                 //for each window
917                 for (int i = 0; i < windows.size(); i++) {
918                 
919                         //get confidence limits
920                         float t = getT(closest.size()-1);  //how many seqs you are comparing to this querySeq
921                         float dsUpper = (averageQuery[i] + (t * sdQuery[i])) / averageRef[i]; 
922                         float dsLower = (averageQuery[i] - (t * sdQuery[i])) / averageRef[i]; 
923                 
924                         if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[i] > averageRef[i])) {  /* range does not include 1 */
925                                         isChimericConfidence[i] = true;   /* significantly higher at P<0.05 */ 
926
927                         }
928                         
929                         //student t test
930                         int degreeOfFreedom = refCombo + closest.size() - 2;
931                         float denomForT = (((refCombo-1) * varQuery[i] + (closest.size() - 1) * varRef[i]) / (float) degreeOfFreedom) * ((refCombo + closest.size()) / (float) (refCombo * closest.size()));    /* denominator, without sqrt(), for ts calculations */
932                                 
933                         float ts = fabs((averageQuery[i] - averageRef[i]) / (sqrt(denomForT)));  /* value of ts for t-student test */   
934                         t = getT(degreeOfFreedom);
935         
936                         if ((ts >= t) && (averageQuery[i] > averageRef[i])) {   
937                                 isChimericTStudent[i] = true;   /* significantly higher at P<0.05 */ 
938                         }
939                 
940                         //anova test
941                         float value1 = sumQuery[i] + sumRef[i];
942                         float value2 = sumSquaredQuery[i] + sumSquaredRef[i];
943                         float value3 = ((sumQuery[i]*sumQuery[i]) / (float) (closest.size())) + ((sumRef[i] * sumRef[i]) / (float) refCombo);
944                         float value4 = (value1 * value1) / ( (float) (closest.size() + refCombo) );
945                         float value5 = value2 - value4;
946                         float value6 = value3 - value4;
947                         float value7 = value5 - value6;
948                         float value8 = value7 / ((float) degreeOfFreedom);
949                         float anovaValue = value6 / value8;
950                         
951                         float f = getF(degreeOfFreedom);
952                         
953                          if ((anovaValue >= f) && (averageQuery[i] > averageRef[i]))  {
954                        isChimericANOVA[i] = true;   /* significant P<0.05 */
955                 }
956                         
957                         if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; }
958                         
959                         anova[i] = anovaValue;
960                 }
961                 
962         }
963         catch(exception& e) {
964                 m->errorOut(e, "Ccode", "determineChimeras");
965                 exit(1);
966         }
967 }
968 /**************************************************************************************************/    
969 float Ccode::getT(int numseq) {
970         try {
971         
972                 float tvalue = 0;
973                 
974                 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
975                 if (numseq > 120) tvalue = 1.645;
976                 else if (numseq > 60) tvalue = 1.658;
977         else if (numseq > 40) tvalue = 1.671;
978         else if (numseq > 30) tvalue = 1.684;
979         else if (numseq > 29) tvalue = 1.697;
980         else if (numseq > 28) tvalue = 1.699;
981         else if (numseq > 27) tvalue = 1.701;
982         else if (numseq > 26) tvalue = 1.703;
983         else if (numseq > 25) tvalue = 1.706;
984         else if (numseq > 24) tvalue = 1.708;
985         else if (numseq > 23) tvalue = 1.711;
986         else if (numseq > 22) tvalue = 1.714;
987         else if (numseq > 21) tvalue = 1.717;
988         else if (numseq > 20) tvalue = 1.721;
989         else if (numseq > 19) tvalue = 1.725;
990         else if (numseq > 18) tvalue = 1.729;
991         else if (numseq > 17) tvalue = 1.734;
992         else if (numseq > 16) tvalue = 1.740;
993         else if (numseq > 15) tvalue = 1.746;
994         else if (numseq > 14) tvalue = 1.753;
995         else if (numseq > 13) tvalue = 1.761;
996         else if (numseq > 12) tvalue = 1.771;
997         else if (numseq > 11) tvalue = 1.782;
998         else if (numseq > 10) tvalue = 1.796;
999         else if (numseq > 9) tvalue = 1.812;
1000         else if (numseq > 8) tvalue = 1.833;
1001         else if (numseq > 7) tvalue = 1.860;
1002         else if (numseq > 6) tvalue = 1.895;
1003         else if (numseq > 5) tvalue = 1.943;
1004         else if (numseq > 4) tvalue = 2.015;
1005         else if (numseq > 3) tvalue = 2.132;
1006         else if (numseq > 2) tvalue = 2.353;
1007         else if (numseq > 1) tvalue = 2.920;
1008                 else if (numseq <= 1) {
1009                         m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); m->mothurOutEndLine();
1010                 }
1011                 
1012                 return tvalue;
1013         }
1014         catch(exception& e) {
1015                 m->errorOut(e, "Ccode", "getT");
1016                 exit(1);
1017         }
1018 }
1019 /**************************************************************************************************/    
1020 float Ccode::getF(int numseq) {
1021         try {
1022         
1023                 float fvalue = 0;
1024                 
1025                  /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
1026         if (numseq > 120) fvalue = 3.84;
1027         else if (numseq > 60) fvalue = 3.92;
1028         else if (numseq > 40) fvalue = 4.00;
1029         else if (numseq > 30) fvalue = 4.08;
1030         else if (numseq > 29) fvalue = 4.17;
1031         else if (numseq > 28) fvalue = 4.18;
1032         else if (numseq > 27) fvalue = 4.20;
1033         else if (numseq > 26) fvalue = 4.21;
1034         else if (numseq > 25) fvalue = 4.23;
1035         else if (numseq > 24) fvalue = 4.24;
1036         else if (numseq > 23) fvalue = 4.26;
1037         else if (numseq > 22) fvalue = 4.28;
1038         else if (numseq > 21) fvalue = 4.30;
1039         else if (numseq > 20) fvalue = 4.32;
1040         else if (numseq > 19) fvalue = 4.35;
1041         else if (numseq > 18) fvalue = 4.38;
1042         else if (numseq > 17) fvalue = 4.41;
1043         else if (numseq > 16) fvalue = 4.45;
1044         else if (numseq > 15) fvalue = 4.49;
1045         else if (numseq > 14) fvalue = 4.54;
1046         else if (numseq > 13) fvalue = 4.60;
1047         else if (numseq > 12) fvalue = 4.67;
1048         else if (numseq > 11) fvalue = 4.75;
1049         else if (numseq > 10) fvalue = 4.84;
1050         else if (numseq > 9) fvalue = 4.96;
1051         else if (numseq > 8) fvalue = 5.12;
1052         else if (numseq > 7) fvalue = 5.32;
1053         else if (numseq > 6) fvalue = 5.59;
1054         else if (numseq > 5) fvalue = 5.99;
1055         else if (numseq > 4) fvalue = 6.61;
1056         else if (numseq > 3) fvalue = 7.71;
1057         else if (numseq > 2) fvalue = 10.1;
1058         else if (numseq > 1) fvalue = 18.5;
1059         else if (numseq > 0) fvalue = 161;
1060                 else if (numseq <= 0) {
1061                         m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); m->mothurOutEndLine();
1062         }
1063                 
1064                 return fvalue;
1065         }
1066         catch(exception& e) {
1067                 m->errorOut(e, "Ccode", "getF");
1068                 exit(1);
1069         }
1070 }
1071 //***************************************************************************************************************
1072
1073
1074