]> git.donarmstrong.com Git - mothur.git/blob - ccode.cpp
working on nmds
[mothur.git] / ccode.cpp
1 /*
2  *  ccode.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 8/24/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "ccode.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14
15 //***************************************************************************************************************
16 Ccode::Ccode(string filename, string temp, bool f, string mask, int win, int numW, string o) : Chimera() {  
17         fastafile = filename;  
18         outputDir = o; 
19         templateFileName = temp;  templateSeqs = readSeqs(temp);
20         setMask(mask);
21         filter = f;
22         window = win;
23         numWanted = numW;
24         
25         distCalc = new eachGapDist();
26         decalc = new DeCalculator();
27         
28         mapInfo = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "mapinfo";
29         
30         #ifdef USE_MPI
31                 
32                 //char* inFileName = new char[mapInfo.length()];
33                 //memcpy(inFileName, mapInfo.c_str(), mapInfo.length());
34                 
35                 char inFileName[1024];
36                 strcpy(inFileName, mapInfo.c_str());
37                 
38                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
39
40                 MPI_File_open(MPI_COMM_WORLD, inFileName, outMode, MPI_INFO_NULL, &outMap);  //comm, filename, mode, info, filepointer
41                 
42                 //delete inFileName;
43
44                 int pid;
45                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
46                 
47                 if (pid == 0) {
48                         string outString = "Place in masked, filtered and trimmed sequence\tPlace in original alignment\n";
49                         
50                         MPI_Status status;
51                         int length = outString.length();
52                         char* buf2 = new char[length];
53                         memcpy(buf2, outString.c_str(), length);
54                                 
55                         MPI_File_write_shared(outMap, buf2, length, MPI_CHAR, &status);
56                         delete buf2;
57                 }
58         #else
59
60                 ofstream out2;
61                 m->openOutputFile(mapInfo, out2);
62                 
63                 out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
64                 out2.close();
65         #endif
66 }
67 //***************************************************************************************************************
68 Ccode::~Ccode() {
69         delete distCalc;
70         delete decalc;
71         
72         #ifdef USE_MPI
73                 MPI_File_close(&outMap);
74         #endif
75 }       
76 //***************************************************************************************************************
77 Sequence* Ccode::print(ostream& out, ostream& outAcc) {
78         try {
79                 
80                 ofstream out2;
81                 m->openOutputFileAppend(mapInfo, out2);
82                 
83                 out2 << querySeq->getName() << endl;
84                 for (it = spotMap.begin(); it!= spotMap.end(); it++) {
85                         out2 << it->first << '\t' << it->second << endl;
86                 }
87                 out2.close();
88                 out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
89                         
90                 for (int j = 0; j < closest.size(); j++) {
91                         out << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
92                 }
93                 out << endl << endl;
94                 
95                 //for each window
96                 //window mapping info.
97                 out << "Mapping information: ";
98                 //you mask and did not filter
99                 if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
100                                 
101                 //you filtered and did not mask
102                 if ((seqMask == "") && (filter)) { out << "filter and trim."; }
103                                 
104                 //you masked and filtered
105                 if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
106                         
107                 out << endl << "Window\tStartPos\tEndPos" << endl;
108                 it = trim.begin();
109                 for (int k = 0; k < windows.size()-1; k++) {
110                         out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl;
111                 }
112                         
113                 out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl;
114                 out << endl;
115                 out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
116                 for (int k = 0; k < windows.size(); k++) {
117                         float ds = averageQuery[k] / averageRef[k]; 
118                         out << k+1 << '\t' << averageQuery[k] << '\t' << sdQuery[k] << '\t' << averageRef[k] << '\t'<< sdRef[k] << '\t' << ds << '\t' << anova[k] << endl;
119                 }
120                 out << endl;
121                         
122                 //varRef
123                 //varQuery
124                 /* F test for differences among variances.
125                 * varQuery is expected to be higher or similar than varRef */
126                 //float fs = varQuery[query] / varRef[query];   /* F-Snedecor, test for differences of variances */
127                         
128                 bool results = false;   
129                                         
130                 //confidence limit, t - Student, anova
131                 out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
132                         
133                 for (int k = 0; k < windows.size(); k++) {
134                         string temp = "";
135                         if (isChimericConfidence[k]) {  temp += "*\t"; }
136                         else { temp += "\t"; }
137                                 
138                         if (isChimericTStudent[k]) {  temp += "*\t"; }
139                         else { temp += "\t"; }
140                                 
141                         if (isChimericANOVA[k]) {  temp += "*\t"; }
142                         else { temp += "\t"; }
143                         
144                         out << k+1 << '\t' << temp << endl;
145                                 
146                         if (temp == "*\t*\t*\t") {  results = true;  }
147                 }
148                 out << endl;    
149                         
150                 if (results) {
151                         m->mothurOut(querySeq->getName() + " was found have at least one chimeric window."); m->mothurOutEndLine();
152                         outAcc << querySeq->getName() << endl;
153                 }
154
155                 //free memory
156                 for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
157
158                 return NULL;
159         }
160         catch(exception& e) {
161                 m->errorOut(e, "Ccode", "print");
162                 exit(1);
163         }
164 }
165 #ifdef USE_MPI
166 //***************************************************************************************************************
167 Sequence* Ccode::print(MPI_File& out, MPI_File& outAcc) {
168         try {
169                 
170                 string outMapString = "";
171                 
172                 outMapString += querySeq->getName() + "\n";
173                 for (it = spotMap.begin(); it!= spotMap.end(); it++) {
174                         outMapString += toString(it->first)  + "\t" + toString(it->second)  + "\n";
175                 }
176                 printMapping(outMapString);
177                 outMapString = "";
178                 
179                 string outString = "";
180                 string outAccString = "";
181                 
182                 outString +=  querySeq->getName() + "\n\nReference sequences used and distance to query:\n";
183                         
184                 for (int j = 0; j < closest.size(); j++) {
185                         outString += closest[j].seq->getName() + "\t" + toString(closest[j].dist) + "\n";
186                 }
187                 outString += "\n\nMapping information: ";
188                 
189                 //for each window
190                 //window mapping info.
191                 //you mask and did not filter
192                 if ((seqMask != "") && (!filter)) { outString += "mask and trim."; }
193                                 
194                 //you filtered and did not mask
195                 if ((seqMask == "") && (filter)) { outString += "filter and trim."; }
196                                 
197                 //you masked and filtered
198                 if ((seqMask != "") && (filter)) { outString += "mask, filter and trim."; }
199                         
200                 outString += "\nWindow\tStartPos\tEndPos\n";
201                 it = trim.begin();
202                 for (int k = 0; k < windows.size()-1; k++) {
203                         outString += toString(k+1) + "\t" + toString(spotMap[windows[k]-it->first]) + "\t" + toString(spotMap[windows[k]-it->first+windowSizes]) + "\n";
204                 }
205                         
206                 outString += toString(windows.size()) + "\t" + toString(spotMap[windows[windows.size()-1]-it->first]) + "\t" + toString(spotMap[it->second-it->first-1]) + "\n\n";
207                 
208                 outString += "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova\n";
209                 for (int k = 0; k < windows.size(); k++) {
210                         float ds = averageQuery[k] / averageRef[k]; 
211                         outString += toString(k+1) + "\t" + toString(averageQuery[k]) + "\t" + toString(sdQuery[k]) + "\t" + toString(averageRef[k]) + "\t" + toString(sdRef[k]) + "\t" + toString(ds) + "\t" + toString(anova[k]) + "\n";
212                 }
213                         
214                 //varRef
215                 //varQuery
216                 /* F test for differences among variances.
217                 * varQuery is expected to be higher or similar than varRef */
218                 //float fs = varQuery[query] / varRef[query];   /* F-Snedecor, test for differences of variances */
219                         
220                 bool results = false;   
221                                         
222                 //confidence limit, t - Student, anova
223                 outString += "\nWindow\tConfidenceLimit\tt-Student\tAnova\n";
224                         
225                 for (int k = 0; k < windows.size(); k++) {
226                         string temp = "";
227                         if (isChimericConfidence[k]) {  temp += "*\t"; }
228                         else { temp += "\t"; }
229                                 
230                         if (isChimericTStudent[k]) {  temp += "*\t"; }
231                         else { temp += "\t"; }
232                                 
233                         if (isChimericANOVA[k]) {  temp += "*\t"; }
234                         else { temp += "\t"; }
235                         
236                         outString += toString(k+1) + "\t" + temp + "\n";
237                                 
238                         if (temp == "*\t*\t*\t") {  results = true;  }
239                 }
240                 outString += "\n";      
241                 
242                 MPI_Status status;
243                 int length = outString.length();
244                 char* buf2 = new char[length];
245                 memcpy(buf2, outString.c_str(), length);
246                                 
247                 MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status);
248                 delete buf2;
249
250                 if (results) {
251                         m->mothurOut(querySeq->getName() + " was found have at least one chimeric window."); m->mothurOutEndLine();
252                         outAccString += querySeq->getName() + "\n";
253                         
254                         MPI_Status statusAcc;
255                         length = outAccString.length();
256                         char* buf = new char[length];
257                         memcpy(buf, outAccString.c_str(), length);
258                                 
259                         MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc);
260                         delete buf;
261                 }
262
263                 //free memory
264                 for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
265
266                 return NULL;
267         }
268         catch(exception& e) {
269                 m->errorOut(e, "Ccode", "print");
270                 exit(1);
271         }
272 }
273 //***************************************************************************************************************
274 int Ccode::printMapping(string& output) {
275         try {
276                         MPI_Status status;
277                         int length = output.length();
278                         char* buf = new char[length];
279                         memcpy(buf, output.c_str(), length);
280                                 
281                         MPI_File_write_shared(outMap, buf, length, MPI_CHAR, &status);
282                         delete buf;
283                         
284                         return 0;
285
286         }
287         catch(exception& e) {
288                 m->errorOut(e, "Ccode", "printMapping");
289                 exit(1);
290         }
291 }
292 #endif
293 //***************************************************************************************************************
294 int Ccode::getChimeras(Sequence* query) {
295         try {
296         
297                 closest.clear();
298                 refCombo = 0;
299                 sumRef.clear(); 
300                 varRef.clear(); 
301                 varQuery.clear(); 
302                 sdRef.clear(); 
303                 sdQuery.clear();     
304                 sumQuery.clear();
305                 sumSquaredRef.clear(); 
306                 sumSquaredQuery.clear(); 
307                 averageRef.clear();
308                 averageQuery.clear();
309                 anova.clear();
310                 isChimericConfidence.clear();
311                 isChimericTStudent.clear();
312                 isChimericANOVA.clear();
313                 trim.clear();
314                 spotMap.clear();
315                 windowSizes = window;
316                 windows.clear();
317
318         
319                 querySeq = query;
320                 
321                 //find closest matches to query
322                 closest = findClosest(query, numWanted);
323                 
324                 if (m->control_pressed) {  return 0;  }
325                 
326                 //initialize spotMap
327                 for (int i = 0; i < query->getAligned().length(); i++) {        spotMap[i] = i;         }
328         
329                 //mask sequences if the user wants to 
330                 if (seqMask != "") {
331                         decalc->setMask(seqMask);
332                         
333                         decalc->runMask(query);
334                         
335                         //mask closest
336                         for (int i = 0; i < closest.size(); i++) {      decalc->runMask(closest[i].seq);        }
337                         
338                         spotMap = decalc->getMaskMap();
339                 }
340                 
341                 if (filter) {
342                         vector<Sequence*> temp;
343                         for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq);  }
344                         temp.push_back(query);  
345                         
346                         createFilter(temp, 0.5);
347                 
348                         for (int i = 0; i < temp.size(); i++) { 
349                                 if (m->control_pressed) {  return 0;  }
350                                 runFilter(temp[i]);  
351                         }
352                         
353                         //update spotMap
354                         map<int, int> newMap;
355                         int spot = 0;
356                         
357                         for (int i = 0; i < filterString.length(); i++) {
358                                 if (filterString[i] == '1') {
359                                         //add to newMap
360                                         newMap[spot] = spotMap[i];
361                                         spot++;  
362                                 }
363                         }
364                         spotMap = newMap;
365                 }
366
367                 //trim sequences - this follows ccodes remove_extra_gaps 
368                 trimSequences(query);
369                 if (m->control_pressed) {  return 0;  }
370                 
371                 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().  
372                 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
373                 windows = findWindows();  
374                 if (m->control_pressed) {  return 0;  }
375
376                 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later 
377                 removeBadReferenceSeqs(closest);
378                 if (m->control_pressed) {  return 0;  }
379                 
380                 //find the averages for each querys references
381                 getAverageRef(closest);  //fills sumRef, averageRef, sumSquaredRef and refCombo.
382                 getAverageQuery(closest, query);  //fills sumQuery, averageQuery, sumSquaredQuery.
383                 if (m->control_pressed) {  return 0;  }                 
384                 
385                 //find the averages for each querys references 
386                 findVarianceRef();  //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0.
387                 if (m->control_pressed) {  return 0;  } 
388                         
389                 //find the averages for the query 
390                 findVarianceQuery();  //fills varQuery and sdQuery also sets minimum error rate to 0.001 to avoid divide by 0.
391                 if (m->control_pressed) {  return 0;  }
392                                         
393                 determineChimeras();  //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA. 
394                 if (m->control_pressed) {  return 0;  }
395                 
396                 return 0;
397         }
398         catch(exception& e) {
399                 m->errorOut(e, "Ccode", "getChimeras");
400                 exit(1);
401         }
402 }
403 /***************************************************************************************************************/
404 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
405 void Ccode::trimSequences(Sequence* query) {
406         try {
407                 
408                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
409                 int rearPos = query->getAligned().length();
410                 
411                 //********find first position in closest seqs that is a non gap character***********//
412                 //find first position all query seqs that is a non gap character
413                 for (int i = 0; i < closest.size(); i++) {
414                         
415                         string aligned = closest[i].seq->getAligned();
416                         int pos = 0;
417                         
418                         //find first spot in this seq
419                         for (int j = 0; j < aligned.length(); j++) {
420                                 if (isalpha(aligned[j])) {
421                                         pos = j;
422                                         break;
423                                 }
424                         }
425                         
426                         //save this spot if it is the farthest
427                         if (pos > frontPos) { frontPos = pos; }
428                 }
429                 
430                 //find first position all querySeq[query] that is a non gap character
431                 string aligned = query->getAligned();
432                 int pos = 0;
433                         
434                 //find first spot in this seq
435                 for (int j = 0; j < aligned.length(); j++) {
436                         if (isalpha(aligned[j])) {
437                                 pos = j;
438                                 break;
439                         }
440                 }
441                 
442                 //save this spot if it is the farthest
443                 if (pos > frontPos) { frontPos = pos; }
444                 
445                 
446                 //********find last position in closest seqs that is a non gap character***********//
447                 for (int i = 0; i < closest.size(); i++) {
448                         
449                         string aligned = closest[i].seq->getAligned();
450                         int pos = aligned.length();
451                         
452                         //find first spot in this seq
453                         for (int j = aligned.length()-1; j >= 0; j--) {
454                                 if (isalpha(aligned[j])) {
455                                         pos = j;
456                                         break;
457                                 }
458                         }
459                         
460                         //save this spot if it is the farthest
461                         if (pos < rearPos) { rearPos = pos; }
462                 }
463                 
464                 //find last position all querySeqs[query] that is a non gap character
465                 aligned = query->getAligned();
466                 pos = aligned.length();
467                 
468                 //find first spot in this seq
469                 for (int j = aligned.length()-1; j >= 0; j--) {
470                         if (isalpha(aligned[j])) {
471                                 pos = j;
472                                 break;
473                         }
474                 }
475                 
476                 //save this spot if it is the farthest
477                 if (pos < rearPos) { rearPos = pos; }
478
479                 
480                 //check to make sure that is not whole seq
481                 if ((rearPos - frontPos - 1) <= 0) {  m->mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); m->mothurOutEndLine(); exit(1);  }
482                 
483                 map<int, int> tempTrim;
484                 tempTrim[frontPos] = rearPos;
485                 
486                 //save trimmed locations
487                 trim = tempTrim;
488                                                 
489                 //update spotMask
490                 map<int, int> newMap;
491                 int spot = 0;
492                 
493                 for (int i = frontPos; i < rearPos; i++) {
494                         //add to newMap
495                         newMap[spot] = spotMap[i];
496                         spot++;  
497                 }
498                 spotMap = newMap;
499         }
500         catch(exception& e) {
501                 m->errorOut(e, "Ccode", "trimSequences");
502                 exit(1);
503         }
504 }
505 /***************************************************************************************************************/
506 vector<int> Ccode::findWindows() {
507         try {
508                 
509                 vector<int> win; 
510                 it = trim.begin();
511                 
512                 int length = it->second - it->first;
513         
514                 //default is wanted = 10% of total length
515                 if (windowSizes > length) { 
516                         m->mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
517                         windowSizes = length / 10;
518                 }else if (windowSizes == 0) { windowSizes = length / 10;  }
519                 else if (windowSizes > (length * 0.20)) {
520                         m->mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); m->mothurOutEndLine();
521                 }else if (windowSizes < (length * 0.05)) {
522                         m->mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); m->mothurOutEndLine();
523                 }
524                 
525                 //save starting points of each window
526                 for (int m = it->first;  m < (it->second-windowSizes); m+=windowSizes) {  win.push_back(m);  }
527                 
528                 //save last window
529                 if (win[win.size()-1] < (it->first+length)) {
530                         win.push_back(win[win.size()-1]+windowSizes); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
531                 }                                                                                                                                                                                                       //with this you would get 1,25,50,75,100
532                 
533                 return win;
534         }
535         catch(exception& e) {
536                 m->errorOut(e, "Ccode", "findWindows");
537                 exit(1);
538         }
539 }
540 //***************************************************************************************************************
541 int Ccode::getDiff(string seqA, string seqB) {
542         try {
543                 
544                 int numDiff = 0;
545                 
546                 for (int i = 0; i < seqA.length(); i++) {
547                         //if you are both not gaps
548                         //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
549                                 //are you different
550                                 if (seqA[i] != seqB[i]) { 
551                                          int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
552
553                                         /* the char in base_a and base_b have been checked and they are different */
554                                         if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
555                                         else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
556                                         else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
557                                         else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
558                                         else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
559                                         else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
560                                         else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
561                                         else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
562                                         else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
563                                         else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
564                                         else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
565                                         else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
566                                         else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
567                                         else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
568                                         else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
569                                         else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
570                                         else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
571                                         else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
572                                         else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
573                                         else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
574                                         else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
575                                         else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
576                                         else ok = 0;  /* the bases are different and not equivalent */
577                                         
578                                         //check if they are both blanks
579                                         if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
580                                         else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
581                                         
582                                         if (ok == 0) {  numDiff++;  }
583                                 }
584                         //}
585                 }
586                 
587                 return numDiff;
588         
589         }
590         catch(exception& e) {
591                 m->errorOut(e, "Ccode", "getDiff");
592                 exit(1);
593         }
594 }
595 //***************************************************************************************************************
596 //tried to make this look most like ccode original implementation
597 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs) {
598         try {
599                 
600                 vector< vector<int> > numDiffBases;
601                 numDiffBases.resize(seqs.size());
602                 //initialize to 0
603                 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
604                 
605                 it = trim.begin();
606                 int length = it->second - it->first;
607                 
608                 //calc differences from each sequence to everyother seq in the set
609                 for (int i = 0; i < seqs.size(); i++) {
610                         
611                         string seqA = seqs[i].seq->getAligned().substr(it->first, length);
612                         
613                         //so you don't calc i to j and j to i since they are the same
614                         for (int j = 0; j < i; j++) {
615                                 
616                                 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
617                                 
618                                 //compare strings
619                                 int numDiff = getDiff(seqA, seqB);
620                                 
621                                 numDiffBases[i][j] = numDiff;
622                                 numDiffBases[j][i] = numDiff;
623                         }
624                 }
625                 
626                 //initailize remove to 0
627                 vector<int> remove;  remove.resize(seqs.size(), 0);
628                 float top = ((20*length) / (float) 100);
629                 float bottom = ((0.5*length) / (float) 100);
630                 
631                 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
632                 for (int i = 0; i < numDiffBases.size(); i++) {
633                         for (int j = 0; j < i; j++) {   
634                                 //are you more than 20% different
635                                 if (numDiffBases[i][j] > top)           {  remove[j] = 1;  }
636                                 //are you less than 0.5% different
637                                 if (numDiffBases[i][j] < bottom)        {  remove[j] = 1;  }
638                         }
639                 }
640                 
641                 int numSeqsLeft = 0;
642                 
643                 //count seqs that are not going to be removed
644                 for (int i = 0; i < remove.size(); i++) {  
645                         if (remove[i] == 0)  { numSeqsLeft++;  }
646                 }
647                 
648                 //if you have enough then remove bad ones
649                 if (numSeqsLeft >= 3) {
650                         vector<SeqDist> goodSeqs;
651                         //remove bad seqs
652                         for (int i = 0; i < remove.size(); i++) {
653                                 if (remove[i] == 0) { 
654                                         goodSeqs.push_back(seqs[i]);
655                                 }
656                         }
657                         
658                         seqs = goodSeqs;
659                         
660                 }else { //warn, but dont remove any
661                         m->mothurOut(querySeq->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); m->mothurOutEndLine();  
662                 }
663
664         }
665         catch(exception& e) {
666                 m->errorOut(e, "Ccode", "removeBadReferenceSeqs");
667                 exit(1);
668         }
669 }
670 //***************************************************************************************************************
671 //makes copy of templateseq for filter
672 vector<SeqDist>  Ccode::findClosest(Sequence* q, int numWanted) {
673         try{
674         
675                 vector<SeqDist>  topMatches;  
676                 
677                 Sequence query = *(q);
678                         
679                 //calc distance to each sequence in template seqs
680                 for (int i = 0; i < templateSeqs.size(); i++) {
681                         
682                         Sequence ref = *(templateSeqs[i]); 
683                                         
684                         //find overall dist
685                         distCalc->calcDist(query, ref);
686                         float dist = distCalc->getDist();       
687                                 
688                         //save distance
689                         SeqDist temp;
690                         temp.seq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned());
691                         temp.dist = dist;
692
693                         topMatches.push_back(temp);
694                 }
695                         
696                 sort(topMatches.begin(), topMatches.end(), compareSeqDist);
697                 
698                 for (int i = numWanted; i < topMatches.size(); i++) {  delete topMatches[i].seq;  }
699                 
700                 topMatches.resize(numWanted);
701                         
702                 return topMatches;
703
704         }
705         catch(exception& e) {
706                 m->errorOut(e, "Ccode", "findClosestSides");
707                 exit(1);
708         }
709 }
710 /**************************************************************************************************/
711 //find the distances from each reference sequence to every other reference sequence for each window for this query
712 void Ccode::getAverageRef(vector<SeqDist> ref) {
713         try {
714                 
715                 vector< vector< vector<int> > >  diffs;  //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
716                 
717                 //initialize diffs vector
718                 diffs.resize(ref.size());
719                 for (int i = 0; i < diffs.size(); i++) {  
720                         diffs[i].resize(ref.size());
721                         for (int j = 0; j < diffs[i].size(); j++) {
722                                 diffs[i][j].resize(windows.size(), 0);
723                         }
724                 }
725                 
726                 it = trim.begin();
727                                 
728                 //find the distances from each reference sequence to every other reference sequence for each window for this query              
729                 for (int i = 0; i < ref.size(); i++) {
730                         
731                         string refI = ref[i].seq->getAligned();
732                         
733                         //j<i, so you don't find distances from i to j and then j to i.
734                         for (int j = 0; j < i; j++) {
735                         
736                                 string refJ = ref[j].seq->getAligned();
737                         
738                                 for (int k = 0; k < windows.size(); k++) {
739                                         
740                                         string refIWindowk, refJWindowk;
741                                         
742                                         if (k < windows.size()-1) {
743                                                 //get window strings
744                                                 refIWindowk = refI.substr(windows[k], windowSizes);
745                                                 refJWindowk = refJ.substr(windows[k], windowSizes);
746                                         }else { //last window may be smaller than rest - see findwindows
747                                                 //get window strings
748                                                 refIWindowk = refI.substr(windows[k], (it->second-windows[k]));
749                                                 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
750                                         }
751                                         
752                                         //find differences
753                                         int diff = getDiff(refIWindowk, refJWindowk);
754
755                                         //save differences in [i][j][k] and [j][i][k] since they are the same
756                                         diffs[i][j][k] = diff;
757                                         diffs[j][i][k] = diff;
758
759                                 }//k
760                                 
761                         }//j
762                 
763                 }//i
764                 
765                 //initialize sumRef for this query 
766                 sumRef.resize(windows.size(), 0);
767                 sumSquaredRef.resize(windows.size(), 0);
768                 averageRef.resize(windows.size(), 0);
769                 
770                 //find the sum of the differences for hte reference sequences
771                 for (int i = 0; i < diffs.size(); i++) {
772                         for (int j = 0; j < i; j++) {
773                         
774                                 //increment this querys reference sequences combos
775                                 refCombo++;
776                                 
777                                 for (int k = 0; k < diffs[i][j].size(); k++) {
778                                         sumRef[k] += diffs[i][j][k];
779                                         sumSquaredRef[k] += (diffs[i][j][k]*diffs[i][j][k]);
780                                 }//k
781                                 
782                         }//j
783                 }//i
784
785                 
786                 //find the average of the differences for the references for each window
787                 for (int i = 0; i < windows.size(); i++) {
788                         averageRef[i] = sumRef[i] / (float) refCombo;
789                 }
790                 
791         }
792         catch(exception& e) {
793                 m->errorOut(e, "Ccode", "getAverageRef");
794                 exit(1);
795         }
796 }
797 /**************************************************************************************************/
798 void Ccode::getAverageQuery (vector<SeqDist> ref, Sequence* query) {
799         try {
800         
801                 vector< vector<int> >  diffs;  //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
802                 
803                 //initialize diffs vector
804                 diffs.resize(ref.size());
805                 for (int j = 0; j < diffs.size(); j++) {
806                         diffs[j].resize(windows.size(), 0);
807                 }
808                 
809                 it = trim.begin();
810                                                         
811                 string refQuery = query->getAligned();
812                          
813                 //j<i, so you don't find distances from i to j and then j to i.
814                 for (int j = 0; j < ref.size(); j++) {
815                          
816                          string refJ = ref[j].seq->getAligned();
817                          
818                          for (int k = 0; k < windows.size(); k++) {
819                                         
820                                         string QueryWindowk, refJWindowk;
821                                         
822                                         if (k < windows.size()-1) {
823                                                 //get window strings
824                                                 QueryWindowk = refQuery.substr(windows[k], windowSizes);
825                                                 refJWindowk = refJ.substr(windows[k], windowSizes);                                     
826                                         }else { //last window may be smaller than rest - see findwindows
827                                                 //get window strings
828                                                 QueryWindowk = refQuery.substr(windows[k], (it->second-windows[k]));
829                                                 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
830                                         }
831                                         
832                                         //find differences
833                                         int diff = getDiff(QueryWindowk, refJWindowk);
834                                         
835                                         //save differences 
836                                         diffs[j][k] = diff;
837                          
838                          }//k
839                 }//j
840                          
841                 
842                 //initialize sumRef for this query 
843                 sumQuery.resize(windows.size(), 0);
844                 sumSquaredQuery.resize(windows.size(), 0);
845                 averageQuery.resize(windows.size(), 0);
846                 
847                 //find the sum of the differences 
848                 for (int j = 0; j < diffs.size(); j++) {
849                         for (int k = 0; k < diffs[j].size(); k++) {
850                                 sumQuery[k] += diffs[j][k];
851                                 sumSquaredQuery[k] += (diffs[j][k]*diffs[j][k]);
852                         }//k
853                 }//j
854                 
855                 
856                 //find the average of the differences for the references for each window
857                 for (int i = 0; i < windows.size(); i++) {
858                         averageQuery[i] = sumQuery[i] / (float) ref.size();
859                 }
860         }
861         catch(exception& e) {
862                 m->errorOut(e, "Ccode", "getAverageQuery");
863                 exit(1);
864         }
865 }
866 /**************************************************************************************************/
867 void Ccode::findVarianceRef() {
868         try {
869                 
870                 varRef.resize(windows.size(), 0);
871                 sdRef.resize(windows.size(), 0);
872                 
873                 //for each window
874                 for (int i = 0; i < windows.size(); i++) {
875                         varRef[i] = (sumSquaredRef[i] - ((sumRef[i]*sumRef[i])/(float)refCombo)) / (float)(refCombo-1);
876                         sdRef[i] = sqrt(varRef[i]);
877                         
878                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
879                         if (averageRef[i] < 0.001)                      {       averageRef[i] = 0.001;          }
880                         if (sumRef[i] < 0.001)                          {       sumRef[i] = 0.001;                      }
881                         if (varRef[i] < 0.001)                          {       varRef[i] = 0.001;                      }
882                         if (sumSquaredRef[i] < 0.001)           {       sumSquaredRef[i] = 0.001;       }
883                         if (sdRef[i] < 0.001)                           {       sdRef[i] = 0.001;                       }
884                                 
885                 }
886         }
887         catch(exception& e) {
888                 m->errorOut(e, "Ccode", "findVarianceRef");
889                 exit(1);
890         }
891 }
892 /**************************************************************************************************/
893 void Ccode::findVarianceQuery() {
894         try {
895                 varQuery.resize(windows.size(), 0);
896                 sdQuery.resize(windows.size(), 0);
897                 
898                 //for each window
899                 for (int i = 0; i < windows.size(); i++) {
900                         varQuery[i] = (sumSquaredQuery[i] - ((sumQuery[i]*sumQuery[i])/(float) closest.size())) / (float) (closest.size()-1);
901                         sdQuery[i] = sqrt(varQuery[i]);
902                         
903                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
904                         if (averageQuery[i] < 0.001)                    {       averageQuery[i] = 0.001;                }
905                         if (sumQuery[i] < 0.001)                                {       sumQuery[i] = 0.001;                    }
906                         if (varQuery[i] < 0.001)                                {       varQuery[i] = 0.001;                    }
907                         if (sumSquaredQuery[i] < 0.001)         {       sumSquaredQuery[i] = 0.001;     }
908                         if (sdQuery[i] < 0.001)                         {       sdQuery[i] = 0.001;                     }
909                 }
910
911         }
912         catch(exception& e) {
913                 m->errorOut(e, "Ccode", "findVarianceQuery");
914                 exit(1);
915         }
916 }
917 /**************************************************************************************************/
918 void Ccode::determineChimeras() {
919         try {
920                 
921                 isChimericConfidence.resize(windows.size(), false);
922                 isChimericTStudent.resize(windows.size(), false);
923                 isChimericANOVA.resize(windows.size(), false);
924                 anova.resize(windows.size());
925
926                 
927                 //for each window
928                 for (int i = 0; i < windows.size(); i++) {
929                 
930                         //get confidence limits
931                         float t = getT(closest.size()-1);  //how many seqs you are comparing to this querySeq
932                         float dsUpper = (averageQuery[i] + (t * sdQuery[i])) / averageRef[i]; 
933                         float dsLower = (averageQuery[i] - (t * sdQuery[i])) / averageRef[i]; 
934                 
935                         if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[i] > averageRef[i])) {  /* range does not include 1 */
936                                         isChimericConfidence[i] = true;   /* significantly higher at P<0.05 */ 
937
938                         }
939                         
940                         //student t test
941                         int degreeOfFreedom = refCombo + closest.size() - 2;
942                         float denomForT = (((refCombo-1) * varQuery[i] + (closest.size() - 1) * varRef[i]) / (float) degreeOfFreedom) * ((refCombo + closest.size()) / (float) (refCombo * closest.size()));    /* denominator, without sqrt(), for ts calculations */
943                                 
944                         float ts = fabs((averageQuery[i] - averageRef[i]) / (sqrt(denomForT)));  /* value of ts for t-student test */   
945                         t = getT(degreeOfFreedom);
946         
947                         if ((ts >= t) && (averageQuery[i] > averageRef[i])) {   
948                                 isChimericTStudent[i] = true;   /* significantly higher at P<0.05 */ 
949                         }
950                 
951                         //anova test
952                         float value1 = sumQuery[i] + sumRef[i];
953                         float value2 = sumSquaredQuery[i] + sumSquaredRef[i];
954                         float value3 = ((sumQuery[i]*sumQuery[i]) / (float) (closest.size())) + ((sumRef[i] * sumRef[i]) / (float) refCombo);
955                         float value4 = (value1 * value1) / ( (float) (closest.size() + refCombo) );
956                         float value5 = value2 - value4;
957                         float value6 = value3 - value4;
958                         float value7 = value5 - value6;
959                         float value8 = value7 / ((float) degreeOfFreedom);
960                         float anovaValue = value6 / value8;
961                         
962                         float f = getF(degreeOfFreedom);
963                         
964                          if ((anovaValue >= f) && (averageQuery[i] > averageRef[i]))  {
965                        isChimericANOVA[i] = true;   /* significant P<0.05 */
966                 }
967                         
968                         if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; }
969                         
970                         anova[i] = anovaValue;
971                 }
972                 
973         }
974         catch(exception& e) {
975                 m->errorOut(e, "Ccode", "determineChimeras");
976                 exit(1);
977         }
978 }
979 /**************************************************************************************************/    
980 float Ccode::getT(int numseq) {
981         try {
982         
983                 float tvalue = 0;
984                 
985                 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
986                 if (numseq > 120) tvalue = 1.645;
987                 else if (numseq > 60) tvalue = 1.658;
988         else if (numseq > 40) tvalue = 1.671;
989         else if (numseq > 30) tvalue = 1.684;
990         else if (numseq > 29) tvalue = 1.697;
991         else if (numseq > 28) tvalue = 1.699;
992         else if (numseq > 27) tvalue = 1.701;
993         else if (numseq > 26) tvalue = 1.703;
994         else if (numseq > 25) tvalue = 1.706;
995         else if (numseq > 24) tvalue = 1.708;
996         else if (numseq > 23) tvalue = 1.711;
997         else if (numseq > 22) tvalue = 1.714;
998         else if (numseq > 21) tvalue = 1.717;
999         else if (numseq > 20) tvalue = 1.721;
1000         else if (numseq > 19) tvalue = 1.725;
1001         else if (numseq > 18) tvalue = 1.729;
1002         else if (numseq > 17) tvalue = 1.734;
1003         else if (numseq > 16) tvalue = 1.740;
1004         else if (numseq > 15) tvalue = 1.746;
1005         else if (numseq > 14) tvalue = 1.753;
1006         else if (numseq > 13) tvalue = 1.761;
1007         else if (numseq > 12) tvalue = 1.771;
1008         else if (numseq > 11) tvalue = 1.782;
1009         else if (numseq > 10) tvalue = 1.796;
1010         else if (numseq > 9) tvalue = 1.812;
1011         else if (numseq > 8) tvalue = 1.833;
1012         else if (numseq > 7) tvalue = 1.860;
1013         else if (numseq > 6) tvalue = 1.895;
1014         else if (numseq > 5) tvalue = 1.943;
1015         else if (numseq > 4) tvalue = 2.015;
1016         else if (numseq > 3) tvalue = 2.132;
1017         else if (numseq > 2) tvalue = 2.353;
1018         else if (numseq > 1) tvalue = 2.920;
1019                 else if (numseq <= 1) {
1020                         m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); m->mothurOutEndLine();
1021                 }
1022                 
1023                 return tvalue;
1024         }
1025         catch(exception& e) {
1026                 m->errorOut(e, "Ccode", "getT");
1027                 exit(1);
1028         }
1029 }
1030 /**************************************************************************************************/    
1031 float Ccode::getF(int numseq) {
1032         try {
1033         
1034                 float fvalue = 0;
1035                 
1036                  /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
1037         if (numseq > 120) fvalue = 3.84;
1038         else if (numseq > 60) fvalue = 3.92;
1039         else if (numseq > 40) fvalue = 4.00;
1040         else if (numseq > 30) fvalue = 4.08;
1041         else if (numseq > 29) fvalue = 4.17;
1042         else if (numseq > 28) fvalue = 4.18;
1043         else if (numseq > 27) fvalue = 4.20;
1044         else if (numseq > 26) fvalue = 4.21;
1045         else if (numseq > 25) fvalue = 4.23;
1046         else if (numseq > 24) fvalue = 4.24;
1047         else if (numseq > 23) fvalue = 4.26;
1048         else if (numseq > 22) fvalue = 4.28;
1049         else if (numseq > 21) fvalue = 4.30;
1050         else if (numseq > 20) fvalue = 4.32;
1051         else if (numseq > 19) fvalue = 4.35;
1052         else if (numseq > 18) fvalue = 4.38;
1053         else if (numseq > 17) fvalue = 4.41;
1054         else if (numseq > 16) fvalue = 4.45;
1055         else if (numseq > 15) fvalue = 4.49;
1056         else if (numseq > 14) fvalue = 4.54;
1057         else if (numseq > 13) fvalue = 4.60;
1058         else if (numseq > 12) fvalue = 4.67;
1059         else if (numseq > 11) fvalue = 4.75;
1060         else if (numseq > 10) fvalue = 4.84;
1061         else if (numseq > 9) fvalue = 4.96;
1062         else if (numseq > 8) fvalue = 5.12;
1063         else if (numseq > 7) fvalue = 5.32;
1064         else if (numseq > 6) fvalue = 5.59;
1065         else if (numseq > 5) fvalue = 5.99;
1066         else if (numseq > 4) fvalue = 6.61;
1067         else if (numseq > 3) fvalue = 7.71;
1068         else if (numseq > 2) fvalue = 10.1;
1069         else if (numseq > 1) fvalue = 18.5;
1070         else if (numseq > 0) fvalue = 161;
1071                 else if (numseq <= 0) {
1072                         m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); m->mothurOutEndLine();
1073         }
1074                 
1075                 return fvalue;
1076         }
1077         catch(exception& e) {
1078                 m->errorOut(e, "Ccode", "getF");
1079                 exit(1);
1080         }
1081 }
1082 //***************************************************************************************************************
1083
1084
1085