]> git.donarmstrong.com Git - mothur.git/blob - ccode.cpp
changed random forest output filename
[mothur.git] / ccode.cpp
1 /*
2  *  ccode.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 8/24/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "ccode.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14 //***************************************************************************************************************
15 Ccode::Ccode(string filename, string temp, bool f, string mask, int win, int numW, string o) : Chimera() {  
16  try {  
17         
18         fastafile = filename;  
19         outputDir = o; 
20         templateFileName = temp;  templateSeqs = readSeqs(temp); 
21         setMask(mask);
22         filter = f;
23         window = win;
24         numWanted = numW;
25         
26         distCalc = new eachGapDist();
27         decalc = new DeCalculator();
28         
29         mapInfo = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "mapinfo";
30         
31         #ifdef USE_MPI
32                 
33                 //char* inFileName = new char[mapInfo.length()];
34                 //memcpy(inFileName, mapInfo.c_str(), mapInfo.length());
35                 
36                 char inFileName[1024];
37                 strcpy(inFileName, mapInfo.c_str());
38                 
39                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
40
41                 MPI_File_open(MPI_COMM_WORLD, inFileName, outMode, MPI_INFO_NULL, &outMap);  //comm, filename, mode, info, filepointer
42                 
43                 //delete inFileName;
44
45                 int pid;
46                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
47                 
48                 if (pid == 0) {
49                         string outString = "Place in masked, filtered and trimmed sequence\tPlace in original alignment\n";
50                         
51                         MPI_Status status;
52                         int length = outString.length();
53                         char* buf2 = new char[length];
54                         memcpy(buf2, outString.c_str(), length);
55                                 
56                         MPI_File_write_shared(outMap, buf2, length, MPI_CHAR, &status);
57                         delete buf2;
58                 }
59         #else
60
61                 ofstream out2;
62                 m->openOutputFile(mapInfo, out2);
63                 
64                 out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
65                 out2.close();
66         #endif
67         }
68         catch(exception& e) {
69                 m->errorOut(e, "Ccode", "Ccode");
70                 exit(1);
71         }
72 }
73 //***************************************************************************************************************
74 Ccode::~Ccode() {
75         delete distCalc;
76         delete decalc;
77         
78         #ifdef USE_MPI
79                 MPI_File_close(&outMap);
80         #endif
81 }       
82 //***************************************************************************************************************
83 Sequence Ccode::print(ostream& out, ostream& outAcc) {
84         try {
85                 
86                 ofstream out2;
87                 m->openOutputFileAppend(mapInfo, out2);
88                 
89                 out2 << querySeq->getName() << endl;
90                 for (it = spotMap.begin(); it!= spotMap.end(); it++) {
91                         out2 << it->first << '\t' << it->second << endl;
92                 }
93                 out2.close();
94                 out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
95                         
96                 for (int j = 0; j < closest.size(); j++) {
97                         out << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
98                 }
99                 out << endl << endl;
100                 
101                 //for each window
102                 //window mapping info.
103                 out << "Mapping information: ";
104                 //you mask and did not filter
105                 if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
106                                 
107                 //you filtered and did not mask
108                 if ((seqMask == "") && (filter)) { out << "filter and trim."; }
109                                 
110                 //you masked and filtered
111                 if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
112                         
113                 out << endl << "Window\tStartPos\tEndPos" << endl;
114                 it = trim.begin();
115                 for (int k = 0; k < windows.size()-1; k++) {
116                         out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl;
117                 }
118                         
119                 out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl;
120                 out << endl;
121                 out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
122                 for (int k = 0; k < windows.size(); k++) {
123                         float ds = averageQuery[k] / averageRef[k]; 
124                         out << k+1 << '\t' << averageQuery[k] << '\t' << sdQuery[k] << '\t' << averageRef[k] << '\t'<< sdRef[k] << '\t' << ds << '\t' << anova[k] << endl;
125                 }
126                 out << endl;
127                         
128                 //varRef
129                 //varQuery
130                 /* F test for differences among variances.
131                 * varQuery is expected to be higher or similar than varRef */
132                 //float fs = varQuery[query] / varRef[query];   /* F-Snedecor, test for differences of variances */
133                         
134                 bool results = false;   
135                                         
136                 //confidence limit, t - Student, anova
137                 out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
138                         
139                 for (int k = 0; k < windows.size(); k++) {
140                         string temp = "";
141                         if (isChimericConfidence[k]) {  temp += "*\t"; }
142                         else { temp += "\t"; }
143                                 
144                         if (isChimericTStudent[k]) {  temp += "*\t"; }
145                         else { temp += "\t"; }
146                                 
147                         if (isChimericANOVA[k]) {  temp += "*\t"; }
148                         else { temp += "\t"; }
149                         
150                         out << k+1 << '\t' << temp << endl;
151                                 
152                         if (temp == "*\t*\t*\t") {  results = true;  }
153                 }
154                 out << endl;    
155                         
156                 if (results) {
157                         m->mothurOut(querySeq->getName() + " was found have at least one chimeric window."); m->mothurOutEndLine();
158                         outAcc << querySeq->getName() << endl;
159                 }
160
161                 //free memory
162                 for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
163
164                 return *querySeq;
165         }
166         catch(exception& e) {
167                 m->errorOut(e, "Ccode", "print");
168                 exit(1);
169         }
170 }
171 #ifdef USE_MPI
172 //***************************************************************************************************************
173 Sequence Ccode::print(MPI_File& out, MPI_File& outAcc) {
174         try {
175                 
176                 string outMapString = "";
177                 
178                 outMapString += querySeq->getName() + "\n";
179                 for (it = spotMap.begin(); it!= spotMap.end(); it++) {
180                         outMapString += toString(it->first)  + "\t" + toString(it->second)  + "\n";
181                 }
182                 printMapping(outMapString);
183                 outMapString = "";
184                 
185                 string outString = "";
186                 string outAccString = "";
187                 
188                 outString +=  querySeq->getName() + "\n\nReference sequences used and distance to query:\n";
189                         
190                 for (int j = 0; j < closest.size(); j++) {
191                         outString += closest[j].seq->getName() + "\t" + toString(closest[j].dist) + "\n";
192                 }
193                 outString += "\n\nMapping information: ";
194                 
195                 //for each window
196                 //window mapping info.
197                 //you mask and did not filter
198                 if ((seqMask != "") && (!filter)) { outString += "mask and trim."; }
199                                 
200                 //you filtered and did not mask
201                 if ((seqMask == "") && (filter)) { outString += "filter and trim."; }
202                                 
203                 //you masked and filtered
204                 if ((seqMask != "") && (filter)) { outString += "mask, filter and trim."; }
205                         
206                 outString += "\nWindow\tStartPos\tEndPos\n";
207                 it = trim.begin();
208                 for (int k = 0; k < windows.size()-1; k++) {
209                         outString += toString(k+1) + "\t" + toString(spotMap[windows[k]-it->first]) + "\t" + toString(spotMap[windows[k]-it->first+windowSizes]) + "\n";
210                 }
211                         
212                 outString += toString(windows.size()) + "\t" + toString(spotMap[windows[windows.size()-1]-it->first]) + "\t" + toString(spotMap[it->second-it->first-1]) + "\n\n";
213                 
214                 outString += "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova\n";
215                 for (int k = 0; k < windows.size(); k++) {
216                         float ds = averageQuery[k] / averageRef[k]; 
217                         outString += toString(k+1) + "\t" + toString(averageQuery[k]) + "\t" + toString(sdQuery[k]) + "\t" + toString(averageRef[k]) + "\t" + toString(sdRef[k]) + "\t" + toString(ds) + "\t" + toString(anova[k]) + "\n";
218                 }
219                         
220                 //varRef
221                 //varQuery
222                 /* F test for differences among variances.
223                 * varQuery is expected to be higher or similar than varRef */
224                 //float fs = varQuery[query] / varRef[query];   /* F-Snedecor, test for differences of variances */
225                         
226                 bool results = false;   
227                                         
228                 //confidence limit, t - Student, anova
229                 outString += "\nWindow\tConfidenceLimit\tt-Student\tAnova\n";
230                         
231                 for (int k = 0; k < windows.size(); k++) {
232                         string temp = "";
233                         if (isChimericConfidence[k]) {  temp += "*\t"; }
234                         else { temp += "\t"; }
235                                 
236                         if (isChimericTStudent[k]) {  temp += "*\t"; }
237                         else { temp += "\t"; }
238                                 
239                         if (isChimericANOVA[k]) {  temp += "*\t"; }
240                         else { temp += "\t"; }
241                         
242                         outString += toString(k+1) + "\t" + temp + "\n";
243                                 
244                         if (temp == "*\t*\t*\t") {  results = true;  }
245                 }
246                 outString += "\n";      
247                 
248                 MPI_Status status;
249                 int length = outString.length();
250                 char* buf2 = new char[length];
251                 memcpy(buf2, outString.c_str(), length);
252                                 
253                 MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status);
254                 delete buf2;
255
256                 if (results) {
257                         m->mothurOut(querySeq->getName() + " was found have at least one chimeric window."); m->mothurOutEndLine();
258                         outAccString += querySeq->getName() + "\n";
259                         
260                         MPI_Status statusAcc;
261                         length = outAccString.length();
262                         char* buf = new char[length];
263                         memcpy(buf, outAccString.c_str(), length);
264                                 
265                         MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc);
266                         delete buf;
267                 }
268
269                 //free memory
270                 for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
271
272                 return *querySeq;
273         }
274         catch(exception& e) {
275                 m->errorOut(e, "Ccode", "print");
276                 exit(1);
277         }
278 }
279 //***************************************************************************************************************
280 int Ccode::printMapping(string& output) {
281         try {
282                         MPI_Status status;
283                         int length = output.length();
284                         char* buf = new char[length];
285                         memcpy(buf, output.c_str(), length);
286                                 
287                         MPI_File_write_shared(outMap, buf, length, MPI_CHAR, &status);
288                         delete buf;
289                         
290                         return 0;
291
292         }
293         catch(exception& e) {
294                 m->errorOut(e, "Ccode", "printMapping");
295                 exit(1);
296         }
297 }
298 #endif
299 //***************************************************************************************************************
300 int Ccode::getChimeras(Sequence* query) {
301         try {
302         
303                 closest.clear();
304                 refCombo = 0;
305                 sumRef.clear(); 
306                 varRef.clear(); 
307                 varQuery.clear(); 
308                 sdRef.clear(); 
309                 sdQuery.clear();     
310                 sumQuery.clear();
311                 sumSquaredRef.clear(); 
312                 sumSquaredQuery.clear(); 
313                 averageRef.clear();
314                 averageQuery.clear();
315                 anova.clear();
316                 isChimericConfidence.clear();
317                 isChimericTStudent.clear();
318                 isChimericANOVA.clear();
319                 trim.clear();
320                 spotMap.clear();
321                 windowSizes = window;
322                 windows.clear();
323
324         
325                 querySeq = query;
326                 
327                 //find closest matches to query
328                 closest = findClosest(query, numWanted);
329                 
330                 if (m->control_pressed) {  return 0;  }
331                 
332                 //initialize spotMap
333                 for (int i = 0; i < query->getAligned().length(); i++) {        spotMap[i] = i;         }
334         
335                 //mask sequences if the user wants to 
336                 if (seqMask != "") {
337                         decalc->setMask(seqMask);
338                         
339                         decalc->runMask(query);
340                         
341                         //mask closest
342                         for (int i = 0; i < closest.size(); i++) {      decalc->runMask(closest[i].seq);        }
343                         
344                         spotMap = decalc->getMaskMap();
345                 }
346                 
347                 if (filter) {
348                         vector<Sequence*> temp;
349                         for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq);  }
350                         temp.push_back(query);  
351                         
352                         createFilter(temp, 0.5);
353                 
354                         for (int i = 0; i < temp.size(); i++) { 
355                                 if (m->control_pressed) {  return 0;  }
356                                 runFilter(temp[i]);  
357                         }
358                         
359                         //update spotMap
360                         map<int, int> newMap;
361                         int spot = 0;
362                         
363                         for (int i = 0; i < filterString.length(); i++) {
364                                 if (filterString[i] == '1') {
365                                         //add to newMap
366                                         newMap[spot] = spotMap[i];
367                                         spot++;  
368                                 }
369                         }
370                         spotMap = newMap;
371                 }
372
373                 //trim sequences - this follows ccodes remove_extra_gaps 
374                 trimSequences(query);
375                 if (m->control_pressed) {  return 0;  }
376                 
377                 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().  
378                 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
379                 windows = findWindows();  
380                 if (m->control_pressed) {  return 0;  }
381
382                 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later 
383                 removeBadReferenceSeqs(closest);
384                 if (m->control_pressed) {  return 0;  }
385                 
386                 //find the averages for each querys references
387                 getAverageRef(closest);  //fills sumRef, averageRef, sumSquaredRef and refCombo.
388                 getAverageQuery(closest, query);  //fills sumQuery, averageQuery, sumSquaredQuery.
389                 if (m->control_pressed) {  return 0;  }                 
390                 
391                 //find the averages for each querys references 
392                 findVarianceRef();  //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0.
393                 if (m->control_pressed) {  return 0;  } 
394                         
395                 //find the averages for the query 
396                 findVarianceQuery();  //fills varQuery and sdQuery also sets minimum error rate to 0.001 to avoid divide by 0.
397                 if (m->control_pressed) {  return 0;  }
398                                         
399                 determineChimeras();  //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA. 
400                 if (m->control_pressed) {  return 0;  }
401                 
402                 return 0;
403         }
404         catch(exception& e) {
405                 m->errorOut(e, "Ccode", "getChimeras");
406                 exit(1);
407         }
408 }
409 /***************************************************************************************************************/
410 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
411 void Ccode::trimSequences(Sequence* query) {
412         try {
413                 
414                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
415                 int rearPos = query->getAligned().length();
416                 
417                 //********find first position in closest seqs that is a non gap character***********//
418                 //find first position all query seqs that is a non gap character
419                 for (int i = 0; i < closest.size(); i++) {
420                         
421                         string aligned = closest[i].seq->getAligned();
422                         int pos = 0;
423                         
424                         //find first spot in this seq
425                         for (int j = 0; j < aligned.length(); j++) {
426                                 if (isalpha(aligned[j])) {
427                                         pos = j;
428                                         break;
429                                 }
430                         }
431                         
432                         //save this spot if it is the farthest
433                         if (pos > frontPos) { frontPos = pos; }
434                 }
435                 
436                 //find first position all querySeq[query] that is a non gap character
437                 string aligned = query->getAligned();
438                 int pos = 0;
439                         
440                 //find first spot in this seq
441                 for (int j = 0; j < aligned.length(); j++) {
442                         if (isalpha(aligned[j])) {
443                                 pos = j;
444                                 break;
445                         }
446                 }
447                 
448                 //save this spot if it is the farthest
449                 if (pos > frontPos) { frontPos = pos; }
450                 
451                 
452                 //********find last position in closest seqs that is a non gap character***********//
453                 for (int i = 0; i < closest.size(); i++) {
454                         
455                         string aligned = closest[i].seq->getAligned();
456                         int pos = aligned.length();
457                         
458                         //find first spot in this seq
459                         for (int j = aligned.length()-1; j >= 0; j--) {
460                                 if (isalpha(aligned[j])) {
461                                         pos = j;
462                                         break;
463                                 }
464                         }
465                         
466                         //save this spot if it is the farthest
467                         if (pos < rearPos) { rearPos = pos; }
468                 }
469                 
470                 //find last position all querySeqs[query] that is a non gap character
471                 aligned = query->getAligned();
472                 pos = aligned.length();
473                 
474                 //find first spot in this seq
475                 for (int j = aligned.length()-1; j >= 0; j--) {
476                         if (isalpha(aligned[j])) {
477                                 pos = j;
478                                 break;
479                         }
480                 }
481                 
482                 //save this spot if it is the farthest
483                 if (pos < rearPos) { rearPos = pos; }
484
485                 
486                 //check to make sure that is not whole seq
487                 if ((rearPos - frontPos - 1) <= 0) {  m->mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); m->mothurOutEndLine(); exit(1);  }
488                 
489                 map<int, int> tempTrim;
490                 tempTrim[frontPos] = rearPos;
491                 
492                 //save trimmed locations
493                 trim = tempTrim;
494                                                 
495                 //update spotMask
496                 map<int, int> newMap;
497                 int spot = 0;
498                 
499                 for (int i = frontPos; i < rearPos; i++) {
500                         //add to newMap
501                         newMap[spot] = spotMap[i];
502                         spot++;  
503                 }
504                 spotMap = newMap;
505         }
506         catch(exception& e) {
507                 m->errorOut(e, "Ccode", "trimSequences");
508                 exit(1);
509         }
510 }
511 /***************************************************************************************************************/
512 vector<int> Ccode::findWindows() {
513         try {
514                 
515                 vector<int> win; 
516                 it = trim.begin();
517                 
518                 int length = it->second - it->first;
519         
520                 //default is wanted = 10% of total length
521                 if (windowSizes > length) { 
522                         m->mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
523                         windowSizes = length / 10;
524                 }else if (windowSizes == 0) { windowSizes = length / 10;  }
525                 else if (windowSizes > (length * 0.20)) {
526                         m->mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); m->mothurOutEndLine();
527                 }else if (windowSizes < (length * 0.05)) {
528                         m->mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); m->mothurOutEndLine();
529                 }
530                 
531                 //save starting points of each window
532                 for (int m = it->first;  m < (it->second-windowSizes); m+=windowSizes) {  win.push_back(m);  }
533                 
534                 //save last window
535                 if (win[win.size()-1] < (it->first+length)) {
536                         win.push_back(win[win.size()-1]+windowSizes); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
537                 }                                                                                                                                                                                                       //with this you would get 1,25,50,75,100
538                 
539                 return win;
540         }
541         catch(exception& e) {
542                 m->errorOut(e, "Ccode", "findWindows");
543                 exit(1);
544         }
545 }
546 //***************************************************************************************************************
547 int Ccode::getDiff(string seqA, string seqB) {
548         try {
549                 
550                 int numDiff = 0;
551                 
552                 for (int i = 0; i < seqA.length(); i++) {
553                         //if you are both not gaps
554                         //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
555                                 //are you different
556                                 if (seqA[i] != seqB[i]) { 
557                                          int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
558
559                                         /* the char in base_a and base_b have been checked and they are different */
560                                         if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
561                                         else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
562                                         else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
563                                         else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
564                                         else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
565                                         else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
566                                         else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
567                                         else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
568                                         else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
569                                         else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
570                                         else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
571                                         else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
572                                         else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
573                                         else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
574                                         else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
575                                         else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
576                                         else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
577                                         else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
578                                         else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
579                                         else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
580                                         else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
581                                         else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
582                                         else ok = 0;  /* the bases are different and not equivalent */
583                                         
584                                         //check if they are both blanks
585                                         if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
586                                         else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
587                                         
588                                         if (ok == 0) {  numDiff++;  }
589                                 }
590                         //}
591                 }
592                 
593                 return numDiff;
594         
595         }
596         catch(exception& e) {
597                 m->errorOut(e, "Ccode", "getDiff");
598                 exit(1);
599         }
600 }
601 //***************************************************************************************************************
602 //tried to make this look most like ccode original implementation
603 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs) {
604         try {
605                 
606                 vector< vector<int> > numDiffBases;
607                 numDiffBases.resize(seqs.size());
608                 //initialize to 0
609                 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
610                 
611                 it = trim.begin();
612                 int length = it->second - it->first;
613                 
614                 //calc differences from each sequence to everyother seq in the set
615                 for (int i = 0; i < seqs.size(); i++) {
616                         
617                         string seqA = seqs[i].seq->getAligned().substr(it->first, length);
618                         
619                         //so you don't calc i to j and j to i since they are the same
620                         for (int j = 0; j < i; j++) {
621                                 
622                                 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
623                                 
624                                 //compare strings
625                                 int numDiff = getDiff(seqA, seqB);
626                                 
627                                 numDiffBases[i][j] = numDiff;
628                                 numDiffBases[j][i] = numDiff;
629                         }
630                 }
631                 
632                 //initailize remove to 0
633                 vector<int> remove;  remove.resize(seqs.size(), 0);
634                 float top = ((20*length) / (float) 100);
635                 float bottom = ((0.5*length) / (float) 100);
636                 
637                 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
638                 for (int i = 0; i < numDiffBases.size(); i++) {
639                         for (int j = 0; j < i; j++) {   
640                                 //are you more than 20% different
641                                 if (numDiffBases[i][j] > top)           {  remove[j] = 1;  }
642                                 //are you less than 0.5% different
643                                 if (numDiffBases[i][j] < bottom)        {  remove[j] = 1;  }
644                         }
645                 }
646                 
647                 int numSeqsLeft = 0;
648                 
649                 //count seqs that are not going to be removed
650                 for (int i = 0; i < remove.size(); i++) {  
651                         if (remove[i] == 0)  { numSeqsLeft++;  }
652                 }
653                 
654                 //if you have enough then remove bad ones
655                 if (numSeqsLeft >= 3) {
656                         vector<SeqDist> goodSeqs;
657                         //remove bad seqs
658                         for (int i = 0; i < remove.size(); i++) {
659                                 if (remove[i] == 0) { 
660                                         goodSeqs.push_back(seqs[i]);
661                                 }
662                         }
663                         
664                         seqs = goodSeqs;
665                         
666                 }else { //warn, but dont remove any
667                         m->mothurOut(querySeq->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); m->mothurOutEndLine();  
668                 }
669
670         }
671         catch(exception& e) {
672                 m->errorOut(e, "Ccode", "removeBadReferenceSeqs");
673                 exit(1);
674         }
675 }
676 //***************************************************************************************************************
677 //makes copy of templateseq for filter
678 vector<SeqDist>  Ccode::findClosest(Sequence* q, int numWanted) {
679         try{
680         
681                 vector<SeqDist>  topMatches;  
682                 
683                 Sequence query = *(q);
684                         
685                 //calc distance to each sequence in template seqs
686                 for (int i = 0; i < templateSeqs.size(); i++) {
687                         
688                         Sequence ref = *(templateSeqs[i]); 
689                                         
690                         //find overall dist
691                         distCalc->calcDist(query, ref);
692                         float dist = distCalc->getDist();       
693                                 
694                         //save distance
695                         SeqDist temp;
696                         temp.seq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned());
697                         temp.dist = dist;
698
699                         topMatches.push_back(temp);
700                 }
701                         
702                 sort(topMatches.begin(), topMatches.end(), compareSeqDist);
703                 
704                 for (int i = numWanted; i < topMatches.size(); i++) {  delete topMatches[i].seq;  }
705                 
706                 topMatches.resize(numWanted);
707                         
708                 return topMatches;
709
710         }
711         catch(exception& e) {
712                 m->errorOut(e, "Ccode", "findClosestSides");
713                 exit(1);
714         }
715 }
716 /**************************************************************************************************/
717 //find the distances from each reference sequence to every other reference sequence for each window for this query
718 void Ccode::getAverageRef(vector<SeqDist> ref) {
719         try {
720                 
721                 vector< vector< vector<int> > >  diffs;  //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
722                 
723                 //initialize diffs vector
724                 diffs.resize(ref.size());
725                 for (int i = 0; i < diffs.size(); i++) {  
726                         diffs[i].resize(ref.size());
727                         for (int j = 0; j < diffs[i].size(); j++) {
728                                 diffs[i][j].resize(windows.size(), 0);
729                         }
730                 }
731                 
732                 it = trim.begin();
733                                 
734                 //find the distances from each reference sequence to every other reference sequence for each window for this query              
735                 for (int i = 0; i < ref.size(); i++) {
736                         
737                         string refI = ref[i].seq->getAligned();
738                         
739                         //j<i, so you don't find distances from i to j and then j to i.
740                         for (int j = 0; j < i; j++) {
741                         
742                                 string refJ = ref[j].seq->getAligned();
743                         
744                                 for (int k = 0; k < windows.size(); k++) {
745                                         
746                                         string refIWindowk, refJWindowk;
747                                         
748                                         if (k < windows.size()-1) {
749                                                 //get window strings
750                                                 refIWindowk = refI.substr(windows[k], windowSizes);
751                                                 refJWindowk = refJ.substr(windows[k], windowSizes);
752                                         }else { //last window may be smaller than rest - see findwindows
753                                                 //get window strings
754                                                 refIWindowk = refI.substr(windows[k], (it->second-windows[k]));
755                                                 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
756                                         }
757                                         
758                                         //find differences
759                                         int diff = getDiff(refIWindowk, refJWindowk);
760
761                                         //save differences in [i][j][k] and [j][i][k] since they are the same
762                                         diffs[i][j][k] = diff;
763                                         diffs[j][i][k] = diff;
764
765                                 }//k
766                                 
767                         }//j
768                 
769                 }//i
770                 
771                 //initialize sumRef for this query 
772                 sumRef.resize(windows.size(), 0);
773                 sumSquaredRef.resize(windows.size(), 0);
774                 averageRef.resize(windows.size(), 0);
775                 
776                 //find the sum of the differences for hte reference sequences
777                 for (int i = 0; i < diffs.size(); i++) {
778                         for (int j = 0; j < i; j++) {
779                         
780                                 //increment this querys reference sequences combos
781                                 refCombo++;
782                                 
783                                 for (int k = 0; k < diffs[i][j].size(); k++) {
784                                         sumRef[k] += diffs[i][j][k];
785                                         sumSquaredRef[k] += (diffs[i][j][k]*diffs[i][j][k]);
786                                 }//k
787                                 
788                         }//j
789                 }//i
790
791                 
792                 //find the average of the differences for the references for each window
793                 for (int i = 0; i < windows.size(); i++) {
794                         averageRef[i] = sumRef[i] / (float) refCombo;
795                 }
796                 
797         }
798         catch(exception& e) {
799                 m->errorOut(e, "Ccode", "getAverageRef");
800                 exit(1);
801         }
802 }
803 /**************************************************************************************************/
804 void Ccode::getAverageQuery (vector<SeqDist> ref, Sequence* query) {
805         try {
806         
807                 vector< vector<int> >  diffs;  //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
808                 
809                 //initialize diffs vector
810                 diffs.resize(ref.size());
811                 for (int j = 0; j < diffs.size(); j++) {
812                         diffs[j].resize(windows.size(), 0);
813                 }
814                 
815                 it = trim.begin();
816                                                         
817                 string refQuery = query->getAligned();
818                          
819                 //j<i, so you don't find distances from i to j and then j to i.
820                 for (int j = 0; j < ref.size(); j++) {
821                          
822                          string refJ = ref[j].seq->getAligned();
823                          
824                          for (int k = 0; k < windows.size(); k++) {
825                                         
826                                         string QueryWindowk, refJWindowk;
827                                         
828                                         if (k < windows.size()-1) {
829                                                 //get window strings
830                                                 QueryWindowk = refQuery.substr(windows[k], windowSizes);
831                                                 refJWindowk = refJ.substr(windows[k], windowSizes);                                     
832                                         }else { //last window may be smaller than rest - see findwindows
833                                                 //get window strings
834                                                 QueryWindowk = refQuery.substr(windows[k], (it->second-windows[k]));
835                                                 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
836                                         }
837                                         
838                                         //find differences
839                                         int diff = getDiff(QueryWindowk, refJWindowk);
840                                         
841                                         //save differences 
842                                         diffs[j][k] = diff;
843                          
844                          }//k
845                 }//j
846                          
847                 
848                 //initialize sumRef for this query 
849                 sumQuery.resize(windows.size(), 0);
850                 sumSquaredQuery.resize(windows.size(), 0);
851                 averageQuery.resize(windows.size(), 0);
852                 
853                 //find the sum of the differences 
854                 for (int j = 0; j < diffs.size(); j++) {
855                         for (int k = 0; k < diffs[j].size(); k++) {
856                                 sumQuery[k] += diffs[j][k];
857                                 sumSquaredQuery[k] += (diffs[j][k]*diffs[j][k]);
858                         }//k
859                 }//j
860                 
861                 
862                 //find the average of the differences for the references for each window
863                 for (int i = 0; i < windows.size(); i++) {
864                         averageQuery[i] = sumQuery[i] / (float) ref.size();
865                 }
866         }
867         catch(exception& e) {
868                 m->errorOut(e, "Ccode", "getAverageQuery");
869                 exit(1);
870         }
871 }
872 /**************************************************************************************************/
873 void Ccode::findVarianceRef() {
874         try {
875                 
876                 varRef.resize(windows.size(), 0);
877                 sdRef.resize(windows.size(), 0);
878                 
879                 //for each window
880                 for (int i = 0; i < windows.size(); i++) {
881                         varRef[i] = (sumSquaredRef[i] - ((sumRef[i]*sumRef[i])/(float)refCombo)) / (float)(refCombo-1);
882                         sdRef[i] = sqrt(varRef[i]);
883                         
884                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
885                         if (averageRef[i] < 0.001)                      {       averageRef[i] = 0.001;          }
886                         if (sumRef[i] < 0.001)                          {       sumRef[i] = 0.001;                      }
887                         if (varRef[i] < 0.001)                          {       varRef[i] = 0.001;                      }
888                         if (sumSquaredRef[i] < 0.001)           {       sumSquaredRef[i] = 0.001;       }
889                         if (sdRef[i] < 0.001)                           {       sdRef[i] = 0.001;                       }
890                                 
891                 }
892         }
893         catch(exception& e) {
894                 m->errorOut(e, "Ccode", "findVarianceRef");
895                 exit(1);
896         }
897 }
898 /**************************************************************************************************/
899 void Ccode::findVarianceQuery() {
900         try {
901                 varQuery.resize(windows.size(), 0);
902                 sdQuery.resize(windows.size(), 0);
903                 
904                 //for each window
905                 for (int i = 0; i < windows.size(); i++) {
906                         varQuery[i] = (sumSquaredQuery[i] - ((sumQuery[i]*sumQuery[i])/(float) closest.size())) / (float) (closest.size()-1);
907                         sdQuery[i] = sqrt(varQuery[i]);
908                         
909                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
910                         if (averageQuery[i] < 0.001)                    {       averageQuery[i] = 0.001;                }
911                         if (sumQuery[i] < 0.001)                                {       sumQuery[i] = 0.001;                    }
912                         if (varQuery[i] < 0.001)                                {       varQuery[i] = 0.001;                    }
913                         if (sumSquaredQuery[i] < 0.001)         {       sumSquaredQuery[i] = 0.001;     }
914                         if (sdQuery[i] < 0.001)                         {       sdQuery[i] = 0.001;                     }
915                 }
916
917         }
918         catch(exception& e) {
919                 m->errorOut(e, "Ccode", "findVarianceQuery");
920                 exit(1);
921         }
922 }
923 /**************************************************************************************************/
924 void Ccode::determineChimeras() {
925         try {
926                 
927                 isChimericConfidence.resize(windows.size(), false);
928                 isChimericTStudent.resize(windows.size(), false);
929                 isChimericANOVA.resize(windows.size(), false);
930                 anova.resize(windows.size());
931
932                 
933                 //for each window
934                 for (int i = 0; i < windows.size(); i++) {
935                 
936                         //get confidence limits
937                         float t = getT(closest.size()-1);  //how many seqs you are comparing to this querySeq
938                         float dsUpper = (averageQuery[i] + (t * sdQuery[i])) / averageRef[i]; 
939                         float dsLower = (averageQuery[i] - (t * sdQuery[i])) / averageRef[i]; 
940                 
941                         if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[i] > averageRef[i])) {  /* range does not include 1 */
942                                         isChimericConfidence[i] = true;   /* significantly higher at P<0.05 */ 
943
944                         }
945                         
946                         //student t test
947                         int degreeOfFreedom = refCombo + closest.size() - 2;
948                         float denomForT = (((refCombo-1) * varQuery[i] + (closest.size() - 1) * varRef[i]) / (float) degreeOfFreedom) * ((refCombo + closest.size()) / (float) (refCombo * closest.size()));    /* denominator, without sqrt(), for ts calculations */
949                                 
950                         float ts = fabs((averageQuery[i] - averageRef[i]) / (sqrt(denomForT)));  /* value of ts for t-student test */   
951                         t = getT(degreeOfFreedom);
952         
953                         if ((ts >= t) && (averageQuery[i] > averageRef[i])) {   
954                                 isChimericTStudent[i] = true;   /* significantly higher at P<0.05 */ 
955                         }
956                 
957                         //anova test
958                         float value1 = sumQuery[i] + sumRef[i];
959                         float value2 = sumSquaredQuery[i] + sumSquaredRef[i];
960                         float value3 = ((sumQuery[i]*sumQuery[i]) / (float) (closest.size())) + ((sumRef[i] * sumRef[i]) / (float) refCombo);
961                         float value4 = (value1 * value1) / ( (float) (closest.size() + refCombo) );
962                         float value5 = value2 - value4;
963                         float value6 = value3 - value4;
964                         float value7 = value5 - value6;
965                         float value8 = value7 / ((float) degreeOfFreedom);
966                         float anovaValue = value6 / value8;
967                         
968                         float f = getF(degreeOfFreedom);
969                         
970                          if ((anovaValue >= f) && (averageQuery[i] > averageRef[i]))  {
971                        isChimericANOVA[i] = true;   /* significant P<0.05 */
972                 }
973                         
974                         if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; }
975                         
976                         anova[i] = anovaValue;
977                 }
978                 
979         }
980         catch(exception& e) {
981                 m->errorOut(e, "Ccode", "determineChimeras");
982                 exit(1);
983         }
984 }
985 /**************************************************************************************************/    
986 float Ccode::getT(int numseq) {
987         try {
988         
989                 float tvalue = 0;
990                 
991                 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
992                 if (numseq > 120) tvalue = 1.645;
993                 else if (numseq > 60) tvalue = 1.658;
994         else if (numseq > 40) tvalue = 1.671;
995         else if (numseq > 30) tvalue = 1.684;
996         else if (numseq > 29) tvalue = 1.697;
997         else if (numseq > 28) tvalue = 1.699;
998         else if (numseq > 27) tvalue = 1.701;
999         else if (numseq > 26) tvalue = 1.703;
1000         else if (numseq > 25) tvalue = 1.706;
1001         else if (numseq > 24) tvalue = 1.708;
1002         else if (numseq > 23) tvalue = 1.711;
1003         else if (numseq > 22) tvalue = 1.714;
1004         else if (numseq > 21) tvalue = 1.717;
1005         else if (numseq > 20) tvalue = 1.721;
1006         else if (numseq > 19) tvalue = 1.725;
1007         else if (numseq > 18) tvalue = 1.729;
1008         else if (numseq > 17) tvalue = 1.734;
1009         else if (numseq > 16) tvalue = 1.740;
1010         else if (numseq > 15) tvalue = 1.746;
1011         else if (numseq > 14) tvalue = 1.753;
1012         else if (numseq > 13) tvalue = 1.761;
1013         else if (numseq > 12) tvalue = 1.771;
1014         else if (numseq > 11) tvalue = 1.782;
1015         else if (numseq > 10) tvalue = 1.796;
1016         else if (numseq > 9) tvalue = 1.812;
1017         else if (numseq > 8) tvalue = 1.833;
1018         else if (numseq > 7) tvalue = 1.860;
1019         else if (numseq > 6) tvalue = 1.895;
1020         else if (numseq > 5) tvalue = 1.943;
1021         else if (numseq > 4) tvalue = 2.015;
1022         else if (numseq > 3) tvalue = 2.132;
1023         else if (numseq > 2) tvalue = 2.353;
1024         else if (numseq > 1) tvalue = 2.920;
1025                 else if (numseq <= 1) {
1026                         m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); m->mothurOutEndLine();
1027                 }
1028                 
1029                 return tvalue;
1030         }
1031         catch(exception& e) {
1032                 m->errorOut(e, "Ccode", "getT");
1033                 exit(1);
1034         }
1035 }
1036 /**************************************************************************************************/    
1037 float Ccode::getF(int numseq) {
1038         try {
1039         
1040                 float fvalue = 0;
1041                 
1042                  /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
1043         if (numseq > 120) fvalue = 3.84;
1044         else if (numseq > 60) fvalue = 3.92;
1045         else if (numseq > 40) fvalue = 4.00;
1046         else if (numseq > 30) fvalue = 4.08;
1047         else if (numseq > 29) fvalue = 4.17;
1048         else if (numseq > 28) fvalue = 4.18;
1049         else if (numseq > 27) fvalue = 4.20;
1050         else if (numseq > 26) fvalue = 4.21;
1051         else if (numseq > 25) fvalue = 4.23;
1052         else if (numseq > 24) fvalue = 4.24;
1053         else if (numseq > 23) fvalue = 4.26;
1054         else if (numseq > 22) fvalue = 4.28;
1055         else if (numseq > 21) fvalue = 4.30;
1056         else if (numseq > 20) fvalue = 4.32;
1057         else if (numseq > 19) fvalue = 4.35;
1058         else if (numseq > 18) fvalue = 4.38;
1059         else if (numseq > 17) fvalue = 4.41;
1060         else if (numseq > 16) fvalue = 4.45;
1061         else if (numseq > 15) fvalue = 4.49;
1062         else if (numseq > 14) fvalue = 4.54;
1063         else if (numseq > 13) fvalue = 4.60;
1064         else if (numseq > 12) fvalue = 4.67;
1065         else if (numseq > 11) fvalue = 4.75;
1066         else if (numseq > 10) fvalue = 4.84;
1067         else if (numseq > 9) fvalue = 4.96;
1068         else if (numseq > 8) fvalue = 5.12;
1069         else if (numseq > 7) fvalue = 5.32;
1070         else if (numseq > 6) fvalue = 5.59;
1071         else if (numseq > 5) fvalue = 5.99;
1072         else if (numseq > 4) fvalue = 6.61;
1073         else if (numseq > 3) fvalue = 7.71;
1074         else if (numseq > 2) fvalue = 10.1;
1075         else if (numseq > 1) fvalue = 18.5;
1076         else if (numseq > 0) fvalue = 161;
1077                 else if (numseq <= 0) {
1078                         m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); m->mothurOutEndLine();
1079         }
1080                 
1081                 return fvalue;
1082         }
1083         catch(exception& e) {
1084                 m->errorOut(e, "Ccode", "getF");
1085                 exit(1);
1086         }
1087 }
1088 //***************************************************************************************************************
1089
1090
1091