]> git.donarmstrong.com Git - mothur.git/blob - ccode.cpp
ccode working - still need to paralellize
[mothur.git] / ccode.cpp
1 /*
2  *  ccode.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 8/24/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "ccode.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14
15 //***************************************************************************************************************
16 Ccode::Ccode(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
17 //***************************************************************************************************************
18
19 Ccode::~Ccode() {
20         try {
21                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
22                 for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
23                 delete distCalc;
24         }
25         catch(exception& e) {
26                 errorOut(e, "Ccode", "~Ccode");
27                 exit(1);
28         }
29 }       
30 //***************************************************************************************************************
31 void Ccode::print(ostream& out) {
32         try {
33                 
34                 mothurOutEndLine();
35                 
36                 string mapInfo = getRootName(fastafile) + "mapinfo";
37                 ofstream out2;
38                 openOutputFile(mapInfo, out2);
39                 
40                 out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
41                 
42                 for (int j = 0; j < querySeqs.size(); j++) {
43                         out2 << querySeqs[j]->getName() << endl;
44                         for (it = spotMap[j].begin(); it!= spotMap[j].end(); it++) {
45                                 out2 << it->first << '\t' << it->second << endl;
46                         }
47                 }
48                 out2.close();
49                 
50                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
51                 
52                 out << "For full window mapping info refer to " << mapInfo << endl << endl;
53                 
54                 for (int i = 0; i < querySeqs.size(); i++) {
55                 
56                         out << querySeqs[i]->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
57                         
58                         for (int j = 0; j < closest[i].size(); j++) {
59                                 out << setprecision(3) << closest[i][j].seq->getName() << '\t' << closest[i][j].dist << endl;
60                         }
61                         out << endl << endl;
62                 
63                         //for each window
64                         //window mapping info.
65                         out << "Mapping information: " << endl;
66                         //you mask and did not filter
67                         if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
68                                 
69                         //you filtered and did not mask
70                         if ((seqMask == "") && (filter)) { out << "filter and trim."; }
71                                 
72                         //you masked and filtered
73                         if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
74                         
75                         out << "Window\tStartPos\tEndPos" << endl;
76                         it = trim[i].begin();
77                         
78                         for (int k = 0; k < windows[i].size()-1; k++) {
79                                 out << k+1 << '\t' << spotMap[i][windows[i][k]-it->first] << '\t' << spotMap[i][windows[i][k]-it->first+windowSizes[i]] << endl;
80                         }
81                         
82                         out << windows[i].size() << '\t' << spotMap[i][windows[i][windows[i].size()-1]-it->first] << '\t' << spotMap[i][it->second-it->first-1] << endl;
83                         out << endl;
84                         
85                         out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
86                         for (int k = 0; k < windows[i].size(); k++) {
87                                 float ds = averageQuery[i][k] / averageRef[i][k]; 
88                                 out << k+1 << '\t' << averageQuery[i][k] << '\t' << sdQuery[i][k] << '\t' << averageRef[i][k] << '\t'<< sdRef[i][k] << '\t' << ds << '\t' << anova[i][k] << endl;
89                         }
90                         out << endl;
91                         
92                         //varRef
93                         //varQuery
94                         /* F test for differences among variances.
95                         * varQuery is expected to be higher or similar than varRef */
96                         //float fs = varQuery[query][i] / varRef[query][i];     /* F-Snedecor, test for differences of variances */
97                         
98                         bool results = false;   
99                                                 
100                         //confidence limit, t - Student, anova
101                         out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
102                         
103                         for (int k = 0; k < windows[i].size(); k++) {
104                                 string temp = "";
105                                 if (isChimericConfidence[i][k]) {  temp += "*\t"; }
106                                 else { temp += "\t"; }
107                                 
108                                 if (isChimericTStudent[i][k]) {  temp += "*\t"; }
109                                 else { temp += "\t"; }
110                                 
111                                 if (isChimericANOVA[i][k]) {  temp += "*\t"; }
112                                 else { temp += "\t"; }
113                         
114                                 out << k+1 << '\t' << temp << endl;
115                                 
116                                 if (temp != "\t\t\t") {  results = true;  }
117                         }
118                         out << endl;    
119                         
120                         if (results) {
121                                 mothurOut(querySeqs[i]->getName() + " was found have at least one chimeric window."); mothurOutEndLine();
122                         }
123                 }
124         }
125         catch(exception& e) {
126                 errorOut(e, "Ccode", "print");
127                 exit(1);
128         }
129 }
130
131 //***************************************************************************************************************
132 void Ccode::getChimeras() {
133         try {
134                 
135                 //read in query sequences and subject sequences
136                 mothurOut("Reading sequences and template file... "); cout.flush();
137                 querySeqs = readSeqs(fastafile);
138                 templateSeqs = readSeqs(templateFile);
139                 mothurOut("Done."); mothurOutEndLine();
140                 
141                 int numSeqs = querySeqs.size();
142                 
143                 closest.resize(numSeqs);
144                 
145                 refCombo.resize(numSeqs, 0);
146                 sumRef.resize(numSeqs); 
147                 varRef.resize(numSeqs); 
148                 varQuery.resize(numSeqs); 
149                 sdRef.resize(numSeqs); 
150                 sdQuery.resize(numSeqs);     
151                 sumQuery.resize(numSeqs);
152                 sumSquaredRef.resize(numSeqs); 
153                 sumSquaredQuery.resize(numSeqs); 
154                 averageRef.resize(numSeqs);
155                 averageQuery.resize(numSeqs);
156                 anova.resize(numSeqs);
157                 isChimericConfidence.resize(numSeqs);
158                 isChimericTStudent.resize(numSeqs);
159                 isChimericANOVA.resize(numSeqs);
160                 trim.resize(numSeqs);
161                 spotMap.resize(numSeqs);
162                 windowSizes.resize(numSeqs, window);
163                 windows.resize(numSeqs);
164                 
165                 //break up file if needed
166                 int linesPerProcess = numSeqs / processors ;
167                 
168                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
169                         //find breakup of sequences for all times we will Parallelize
170                         if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
171                         else {
172                                 //fill line pairs
173                                 for (int i = 0; i < (processors-1); i++) {                      
174                                         lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
175                                 }
176                                 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
177                                 int i = processors - 1;
178                                 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
179                         }
180                         
181                         //find breakup of templatefile for quantiles
182                         if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
183                         else { 
184                                 for (int i = 0; i < processors; i++) {
185                                         templateLines.push_back(new linePair());
186                                         templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
187                                         templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
188                                 }
189                         }
190                 #else
191                         lines.push_back(new linePair(0, numSeqs));
192                         templateLines.push_back(new linePair(0, templateSeqs.size()));
193                 #endif
194         
195                 distCalc = new eachGapDist();
196                 decalc = new DeCalculator();
197                 
198                 //find closest
199                 if (processors == 1) { 
200                         mothurOut("Finding top matches for sequences... "); cout.flush();
201                         closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
202                         mothurOut("Done."); mothurOutEndLine();
203                 }else {         createProcessesClosest();               }
204
205                 
206 for (int i = 0; i < closest.size(); i++) {
207         //cout << querySeqs[i]->getName() << ": ";
208         string file = querySeqs[i]->getName() + ".input";
209         ofstream o;
210         openOutputFile(file, o);
211         
212         querySeqs[i]->printSequence(o);
213         for (int j = 0; j < closest[i].size(); j++) {
214                 //cout << closest[i][j].seq->getName() << '\t';
215                 closest[i][j].seq->printSequence(o);
216         }
217         //cout << endl;
218         o.close();
219 }       
220                 for (int j = 0; j < numSeqs; j++) {
221                         for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) {
222                                 spotMap[j][i] = i;
223                         }
224                 }
225                 
226                 //mask sequences if the user wants to 
227                 if (seqMask != "") {
228                         //mask querys
229                         for (int i = 0; i < querySeqs.size(); i++) {
230                                 decalc->runMask(querySeqs[i]);
231                         }
232                 
233                         //mask templates
234                         for (int i = 0; i < templateSeqs.size(); i++) {
235                                 decalc->runMask(templateSeqs[i]);
236                         }
237                         
238                         for (int i = 0; i < numSeqs; i++) {
239                                 spotMap[i] = decalc->getMaskMap();
240                         }
241                 }
242                 
243                 if (filter) {
244                         vector<Sequence*> temp = templateSeqs;
245                         for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]);  }
246                         
247                         createFilter(temp);
248                         
249                         runFilter(querySeqs);
250                         runFilter(templateSeqs);
251                         
252                         //update spotMap
253                         map<int, int> newMap;
254                         int spot = 0;
255                         int j = 0;
256                         
257                         for (int i = 0; i < filterString.length(); i++) {
258                                 if (filterString[i] == 1) {
259                                         //add to newMap
260                                         newMap[spot] = spotMap[j][i];
261                                         spot++;  
262                                 }
263                         }
264                         
265                         for (int i = 0; i < numSeqs; i++) {
266                                 spotMap[i] = newMap;
267                         }
268                 }
269
270                 //trim sequences - this follows ccodes remove_extra_gaps 
271                 for (int i = 0; i < querySeqs.size(); i++) {
272                         trimSequences(i);
273                 }
274                 
275                 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().  
276                 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
277                 for (int i = 0; i < querySeqs.size(); i++) {
278                         windows[i] = findWindows(i);  
279                 }
280
281                 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later - should be paralellized
282                 for (int i = 0; i < closest.size(); i++) {
283                         removeBadReferenceSeqs(closest[i], i);
284                 }
285                 
286         
287                 //find the averages for each querys references - should be paralellized
288                 for (int i = 0; i < numSeqs; i++) {
289                         getAverageRef(closest[i], i);  //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
290                 }
291         
292                 //find the averages for the query - should be paralellized
293                 for (int i = 0; i < numSeqs; i++) {
294                         getAverageQuery(closest[i], i);  //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
295                 }
296                 
297                 //find the averages for each querys references - should be paralellized
298                 for (int i = 0; i < numSeqs; i++) {
299                         findVarianceRef(i);  //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
300                 }
301                 
302                 //find the averages for the query - should be paralellized
303                 for (int i = 0; i < numSeqs; i++) {
304                         findVarianceQuery(i);  //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
305                 }
306
307                 for (int i = 0; i < numSeqs; i++) {
308                         determineChimeras(i);  //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. - should be paralellized
309                 }
310
311                 //free memory
312                 for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
313                 for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
314                         
315         }
316         catch(exception& e) {
317                 errorOut(e, "Ccode", "getChimeras");
318                 exit(1);
319         }
320 }
321 /***************************************************************************************************************/
322 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
323 void Ccode::trimSequences(int query) {
324         try {
325                 
326                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
327                 int rearPos = querySeqs[query]->getAligned().length();
328                 
329                 //********find first position in closest seqs that is a non gap character***********//
330                 //find first position all query seqs that is a non gap character
331                 for (int i = 0; i < closest[query].size(); i++) {
332                         
333                         string aligned = closest[query][i].seq->getAligned();
334                         int pos = 0;
335                         
336                         //find first spot in this seq
337                         for (int j = 0; j < aligned.length(); j++) {
338                                 if (isalpha(aligned[j])) {
339                                         pos = j;
340                                         break;
341                                 }
342                         }
343                         
344                         //save this spot if it is the farthest
345                         if (pos > frontPos) { frontPos = pos; }
346                 }
347                 
348                 //find first position all querySeq[query] that is a non gap character
349                 string aligned = querySeqs[query]->getAligned();
350                 int pos = 0;
351                         
352                 //find first spot in this seq
353                 for (int j = 0; j < aligned.length(); j++) {
354                         if (isalpha(aligned[j])) {
355                                 pos = j;
356                                 break;
357                         }
358                 }
359                 
360                 //save this spot if it is the farthest
361                 if (pos > frontPos) { frontPos = pos; }
362                 
363                 
364                 //********find last position in closest seqs that is a non gap character***********//
365                 for (int i = 0; i < closest[query].size(); i++) {
366                         
367                         string aligned = closest[query][i].seq->getAligned();
368                         int pos = aligned.length();
369                         
370                         //find first spot in this seq
371                         for (int j = aligned.length()-1; j >= 0; j--) {
372                                 if (isalpha(aligned[j])) {
373                                         pos = j;
374                                         break;
375                                 }
376                         }
377                         
378                         //save this spot if it is the farthest
379                         if (pos < rearPos) { rearPos = pos; }
380                 }
381                 
382                 //find last position all querySeqs[query] that is a non gap character
383                 aligned = querySeqs[query]->getAligned();
384                 pos = aligned.length();
385                 
386                 //find first spot in this seq
387                 for (int j = aligned.length()-1; j >= 0; j--) {
388                         if (isalpha(aligned[j])) {
389                                 pos = j;
390                                 break;
391                         }
392                 }
393                 
394                 //save this spot if it is the farthest
395                 if (pos < rearPos) { rearPos = pos; }
396
397                 
398                 //check to make sure that is not whole seq
399                 if ((rearPos - frontPos - 1) <= 0) {  mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1);  }
400                 
401                 map<int, int> tempTrim;
402                 tempTrim[frontPos] = rearPos;
403                 
404                 //save trimmed locations
405                 trim[query] = tempTrim;
406                                                 
407                 //update spotMask
408                 map<int, int> newMap;
409                 int spot = 0;
410                 
411                 for (int i = frontPos; i < rearPos; i++) {
412                         //add to newMap
413 //cout << query << '\t' << i << '\t' << spotMap[query][i] << endl;
414                         newMap[spot] = spotMap[query][i];
415                         spot++;  
416                 }
417                 
418                 //for (it = newMap.begin(); it!=newMap.end(); it++) {
419                         //cout << query << '\t' << it->first << '\t' << it->second << endl;
420                 //}
421                 
422                 spotMap[query] = newMap;
423
424                 
425         }
426         catch(exception& e) {
427                 errorOut(e, "Ccode", "trimSequences");
428                 exit(1);
429         }
430
431 }
432 /***************************************************************************************************************/
433 vector<int> Ccode::findWindows(int query) {
434         try {
435                 
436                 vector<int> win; 
437                 it = trim[query].begin();
438                 
439                 int length = it->second - it->first;
440                 
441                 //default is wanted = 10% of total length
442                 if (windowSizes[query] > length) { 
443                         mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
444                         windowSizes[query] = length / 10;
445                 }else if (windowSizes[query] == 0) { windowSizes[query] = length / 10;  }
446                 else if (windowSizes[query] > (length / 20)) {
447                         mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
448                 }else if (windowSizes[query] < (length / 5)) {
449                         mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
450                 }
451                 
452                 //save starting points of each window
453                 for (int m = it->first;  m < (it->second-windowSizes[query]); m+=windowSizes[query]) {  win.push_back(m);  }
454                 
455                 //save last window
456                 if (win[win.size()-1] < (it->first+length)) {
457                         win.push_back(win[win.size()-1]+windowSizes[query]); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
458                 }                                                                                                                                                                                                       //with this you would get 1,25,50,75,100
459                 
460
461                 return win;
462         
463         }
464         catch(exception& e) {
465                 errorOut(e, "Ccode", "findWindows");
466                 exit(1);
467         }
468 }
469 //***************************************************************************************************************
470 int Ccode::getDiff(string seqA, string seqB) {
471         try {
472                 
473                 int numDiff = 0;
474                 
475                 for (int i = 0; i < seqA.length(); i++) {
476                         //if you are both not gaps
477                         //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
478                                 //are you different
479                                 if (seqA[i] != seqB[i]) { 
480                                          int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
481
482                                         /* the char in base_a and base_b have been checked and they are different */
483                                         if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
484                                         else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
485                                         else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
486                                         else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
487                                         else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
488                                         else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
489                                         else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
490                                         else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
491                                         else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
492                                         else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
493                                         else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
494                                         else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
495                                         else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
496                                         else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
497                                         else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
498                                         else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
499                                         else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
500                                         else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
501                                         else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
502                                         else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
503                                         else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
504                                         else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
505                                         else ok = 0;  /* the bases are different and not equivalent */
506                                         
507                                         //check if they are both blanks
508                                         if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
509                                         else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
510                                         
511                                         if (ok == 0) {  numDiff++;  }
512                                 }
513                         //}
514                 }
515                 
516                 return numDiff;
517         
518         }
519         catch(exception& e) {
520                 errorOut(e, "Ccode", "getDiff");
521                 exit(1);
522         }
523 }
524 //***************************************************************************************************************
525 //tried to make this look most like ccode original implementation
526 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs, int query) {
527         try {
528                 
529                 vector< vector<int> > numDiffBases;
530                 numDiffBases.resize(seqs.size());
531                 //initialize to 0
532                 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
533                 
534                 it = trim[query].begin();
535                 int length = it->second - it->first;
536                 
537                 //calc differences from each sequence to everyother seq in the set
538                 for (int i = 0; i < seqs.size(); i++) {
539                         
540                         string seqA = seqs[i].seq->getAligned().substr(it->first, length);
541                         
542                         //so you don't calc i to j and j to i since they are the same
543                         for (int j = 0; j < i; j++) {
544                                 
545                                 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
546                                 
547                                 //compare strings
548                                 int numDiff = getDiff(seqA, seqB);
549                                 
550                                 numDiffBases[i][j] = numDiff;
551                                 numDiffBases[j][i] = numDiff;
552                         }
553                 }
554                 
555                 //initailize remove to 0
556                 vector<int> remove;  remove.resize(seqs.size(), 0);
557                 float top = ((20*length) / (float) 100);
558                 float bottom = ((0.5*length) / (float) 100);
559                 
560                 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
561                 for (int i = 0; i < numDiffBases.size(); i++) {
562                         for (int j = 0; j < i; j++) {   
563                                 //are you more than 20% different
564                                 if (numDiffBases[i][j] > top)           {  remove[j] = 1;  }
565                                 //are you less than 0.5% different
566                                 if (numDiffBases[i][j] < bottom)        {  remove[j] = 1;  }
567                         }
568                 }
569                 
570                 int numSeqsLeft = 0;
571                 
572                 //count seqs that are not going to be removed
573                 for (int i = 0; i < remove.size(); i++) {  
574                         if (remove[i] == 0)  { numSeqsLeft++;  }
575                 }
576                 
577                 //if you have enough then remove bad ones
578                 if (numSeqsLeft >= 3) {
579                         vector<SeqDist> goodSeqs;
580                         //remove bad seqs
581                         for (int i = 0; i < remove.size(); i++) {
582                                 if (remove[i] == 0) { 
583                                         goodSeqs.push_back(seqs[i]);
584                                 }
585                         }
586                         
587                         seqs = goodSeqs;
588                         
589                 }else { //warn, but dont remove any
590                         mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); mothurOutEndLine();  
591                 }
592                         
593
594         }
595         catch(exception& e) {
596                 errorOut(e, "Ccode", "removeBadReferenceSeqs");
597                 exit(1);
598         }
599 }
600 //***************************************************************************************************************
601 vector< vector<SeqDist> > Ccode::findClosest(int start, int end, int numWanted) {
602         try{
603         
604                 vector< vector<SeqDist> > topMatches;  topMatches.resize(querySeqs.size());
605         
606                 float smallestOverall, smallestLeft, smallestRight;
607                 smallestOverall = 1000;  smallestLeft = 1000;  smallestRight = 1000;
608                 
609                 //for each sequence in querySeqs - find top matches to use as reference
610                 for(int j = start; j < end; j++){
611                         
612                         Sequence query = *(querySeqs[j]);
613                         
614                         vector<SeqDist> distances;
615                         
616                         //calc distance to each sequence in template seqs
617                         for (int i = 0; i < templateSeqs.size(); i++) {
618                         
619                                 Sequence ref = *(templateSeqs[i]); 
620                                         
621                                 //find overall dist
622                                 distCalc->calcDist(query, ref);
623                                 float dist = distCalc->getDist();       
624                                 
625                                 //save distance
626                                 SeqDist temp;
627                                 temp.seq = templateSeqs[i];
628                                 temp.dist = dist;
629
630                                 distances.push_back(temp);
631                         }
632                         
633                         sort(distances.begin(), distances.end(), compareSeqDist);
634                         
635                         //save the number of top matches wanted
636                         for (int h = 0; h < numWanted; h++) {
637                                 topMatches[j].push_back(distances[h]);
638                         }
639                 }
640                         
641                 return topMatches;
642
643         }
644         catch(exception& e) {
645                 errorOut(e, "Ccode", "findClosestSides");
646                 exit(1);
647         }
648 }
649 /**************************************************************************************************/
650 //find the distances from each reference sequence to every other reference sequence for each window for this query
651 void Ccode::getAverageRef(vector<SeqDist> ref, int query) {
652         try {
653                 
654                 vector< vector< vector<int> > > diffs;  //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
655                 
656                 //initialize diffs vector
657                 diffs.resize(ref.size());
658                 for (int i = 0; i < diffs.size(); i++) {  
659                         diffs[i].resize(ref.size());
660                         for (int j = 0; j < diffs[i].size(); j++) {
661                                 diffs[i][j].resize(windows[query].size(), 0);
662                         }
663                 }
664                 
665                 it = trim[query].begin();
666                                 
667                 //find the distances from each reference sequence to every other reference sequence for each window for this query              
668                 for (int i = 0; i < ref.size(); i++) {
669                         
670                         string refI = ref[i].seq->getAligned();
671                         
672                         //j<i, so you don't find distances from i to j and then j to i.
673                         for (int j = 0; j < i; j++) {
674                         
675                                 string refJ = ref[j].seq->getAligned();
676                         
677                                 for (int k = 0; k < windows[query].size(); k++) {
678                                         
679                                         string refIWindowk, refJWindowk;
680                                         
681                                         if (k < windows[query].size()-1) {
682                                                 //get window strings
683                                                 refIWindowk = refI.substr(windows[query][k], windowSizes[query]);
684                                                 refJWindowk = refJ.substr(windows[query][k], windowSizes[query]);
685                                         }else { //last window may be smaller than rest - see findwindows
686                                                 //get window strings
687                                                 refIWindowk = refI.substr(windows[query][k], (it->second-windows[query][k]));
688                                                 refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k]));
689                                         }
690                                         
691                                         //find differences
692                                         int diff = getDiff(refIWindowk, refJWindowk);
693 //cout <<  i << '\t' << j << '\t' << k << '\t' << diff << endl;
694                                         //save differences in [i][j][k] and [j][i][k] since they are the same
695                                         diffs[i][j][k] = diff;
696                                         diffs[j][i][k] = diff;
697
698                                 }//k
699                                 
700                         }//j
701                 
702                 }//i
703                 
704                 //initialize sumRef for this query 
705                 sumRef[query].resize(windows[query].size(), 0);
706                 sumSquaredRef[query].resize(windows[query].size(), 0);
707                 averageRef[query].resize(windows[query].size(), 0);
708                 
709                 //find the sum of the differences for hte reference sequences
710                 for (int i = 0; i < diffs.size(); i++) {
711                         for (int j = 0; j < i; j++) {
712                         
713                                 //increment this querys reference sequences combos
714                                 refCombo[query]++;
715                                 
716                                 for (int k = 0; k < diffs[i][j].size(); k++) {
717                                         sumRef[query][k] += diffs[i][j][k];
718                                         sumSquaredRef[query][k] += (diffs[i][j][k]*diffs[i][j][k]);
719                                 }//k
720                                 
721                         }//j
722                 }//i
723
724                 
725                 //find the average of the differences for the references for each window
726                 for (int i = 0; i < windows[query].size(); i++) {
727                         averageRef[query][i] = sumRef[query][i] / (float) refCombo[query];
728                 }
729                 
730         }
731         catch(exception& e) {
732                 errorOut(e, "Ccode", "getAverageRef");
733                 exit(1);
734         }
735 }
736 /**************************************************************************************************/
737 void Ccode::getAverageQuery (vector<SeqDist> ref, int query) {
738         try {
739         
740                 vector< vector<int> >  diffs;  //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
741                 
742                 //initialize diffs vector
743                 diffs.resize(ref.size());
744                 for (int j = 0; j < diffs.size(); j++) {
745                         diffs[j].resize(windows[query].size(), 0);
746                 }
747                 
748                 it = trim[query].begin();
749                                                         
750                 string refQuery = querySeqs[query]->getAligned();
751                          
752                 //j<i, so you don't find distances from i to j and then j to i.
753                 for (int j = 0; j < ref.size(); j++) {
754                          
755                          string refJ = ref[j].seq->getAligned();
756                          
757                          for (int k = 0; k < windows[query].size(); k++) {
758                                         
759                                         string QueryWindowk, refJWindowk;
760                                         
761                                         if (k < windows[query].size()-1) {
762                                                 //get window strings
763                                                 QueryWindowk = refQuery.substr(windows[query][k], windowSizes[query]);
764                                                 refJWindowk = refJ.substr(windows[query][k], windowSizes[query]);                                       
765                                         }else { //last window may be smaller than rest - see findwindows
766                                                 //get window strings
767                                                 QueryWindowk = refQuery.substr(windows[query][k], (it->second-windows[query][k]));
768                                                 refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k]));
769                                         }
770                                         
771                                         //find differences
772                                         int diff = getDiff(QueryWindowk, refJWindowk);
773 //cout  << j << '\t' << k << '\t' << diff << endl;                                              
774                                         //save differences 
775                                         diffs[j][k] = diff;
776                          
777                          }//k
778                 }//j
779                          
780                 
781                 //initialize sumRef for this query 
782                 sumQuery[query].resize(windows[query].size(), 0);
783                 sumSquaredQuery[query].resize(windows[query].size(), 0);
784                 averageQuery[query].resize(windows[query].size(), 0);
785                 
786                 //find the sum of the differences 
787                 for (int j = 0; j < diffs.size(); j++) {
788                         for (int k = 0; k < diffs[j].size(); k++) {
789                                 sumQuery[query][k] += diffs[j][k];
790                                 sumSquaredQuery[query][k] += (diffs[j][k]*diffs[j][k]);
791                         }//k
792                 }//j
793                 
794                 
795                 //find the average of the differences for the references for each window
796                 for (int i = 0; i < windows[query].size(); i++) {
797                         averageQuery[query][i] = sumQuery[query][i] / (float) ref.size();
798                 }
799
800         
801         }
802         catch(exception& e) {
803                 errorOut(e, "Ccode", "getAverageQuery");
804                 exit(1);
805         }
806 }
807 /**************************************************************************************************/
808 void Ccode::findVarianceRef (int query) {
809         try {
810                 
811                 varRef[query].resize(windows[query].size(), 0);
812                 sdRef[query].resize(windows[query].size(), 0);
813                 
814                 //for each window
815                 for (int i = 0; i < windows[query].size(); i++) {
816                         varRef[query][i] = (sumSquaredRef[query][i] - ((sumRef[query][i]*sumRef[query][i])/(float)refCombo[query])) / (float)(refCombo[query]-1);
817                         sdRef[query][i] = sqrt(varRef[query][i]);
818                         
819                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
820                         if (averageRef[query][i] < 0.001)                       {       averageRef[query][i] = 0.001;           }
821                         if (sumRef[query][i] < 0.001)                           {       sumRef[query][i] = 0.001;                       }
822                         if (varRef[query][i] < 0.001)                           {       varRef[query][i] = 0.001;                       }
823                         if (sumSquaredRef[query][i] < 0.001)            {       sumSquaredRef[query][i] = 0.001;        }
824                         if (sdRef[query][i] < 0.001)                            {       sdRef[query][i] = 0.001;                        }
825                                 
826                 }
827         }
828         catch(exception& e) {
829                 errorOut(e, "Ccode", "findVarianceRef");
830                 exit(1);
831         }
832 }
833 /**************************************************************************************************/
834 void Ccode::findVarianceQuery (int query) {
835         try {
836                 varQuery[query].resize(windows[query].size(), 0);
837                 sdQuery[query].resize(windows[query].size(), 0);
838                 
839                 //for each window
840                 for (int i = 0; i < windows[query].size(); i++) {
841                         varQuery[query][i] = (sumSquaredQuery[query][i] - ((sumQuery[query][i]*sumQuery[query][i])/(float) closest[query].size())) / (float) (closest[query].size()-1);
842                         sdQuery[query][i] = sqrt(varQuery[query][i]);
843                         
844                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
845                         if (averageQuery[query][i] < 0.001)                     {       averageQuery[query][i] = 0.001;         }
846                         if (sumQuery[query][i] < 0.001)                         {       sumQuery[query][i] = 0.001;                     }
847                         if (varQuery[query][i] < 0.001)                         {       varQuery[query][i] = 0.001;                     }
848                         if (sumSquaredQuery[query][i] < 0.001)          {       sumSquaredQuery[query][i] = 0.001;      }
849                         if (sdQuery[query][i] < 0.001)                          {       sdQuery[query][i] = 0.001;                      }
850                 }
851
852         }
853         catch(exception& e) {
854                 errorOut(e, "Ccode", "findVarianceQuery");
855                 exit(1);
856         }
857 }
858 /**************************************************************************************************/
859 void Ccode::determineChimeras (int query) {
860         try {
861                 
862                 isChimericConfidence[query].resize(windows[query].size(), false);
863                 isChimericTStudent[query].resize(windows[query].size(), false);
864                 isChimericANOVA[query].resize(windows[query].size(), false);
865                 anova[query].resize(windows[query].size());
866
867                 
868                 //for each window
869                 for (int i = 0; i < windows[query].size(); i++) {
870                 
871                         //get confidence limits
872                         float t = getT(closest[query].size()-1);  //how many seqs you are comparing to this querySeq
873                         float dsUpper = (averageQuery[query][i] + (t * sdQuery[query][i])) / averageRef[query][i]; 
874                         float dsLower = (averageQuery[query][i] - (t * sdQuery[query][i])) / averageRef[query][i]; 
875 //cout << t << '\t' << "ds upper = " << dsUpper << " dsLower = " << dsLower << endl;                    
876                         if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[query][i] > averageRef[query][i])) {  /* range does not include 1 */
877                                         isChimericConfidence[query][i] = true;   /* significantly higher at P<0.05 */ 
878 //cout << i << " made it here" << endl;
879                         }
880                         
881                         //student t test
882                         int degreeOfFreedom = refCombo[query] + closest[query].size() - 2;
883                         float denomForT = (((refCombo[query]-1) * varQuery[query][i] + (closest[query].size() - 1) * varRef[query][i]) / (float) degreeOfFreedom) * ((refCombo[query] + closest[query].size()) / (float) (refCombo[query] * closest[query].size()));    /* denominator, without sqrt(), for ts calculations */
884                                 
885                         float ts = fabs((averageQuery[query][i] - averageRef[query][i]) / (sqrt(denomForT)));  /* value of ts for t-student test */     
886                         t = getT(degreeOfFreedom);
887 //cout << i << '\t' << t << '\t' << ts << endl;                 
888                         if ((ts >= t) && (averageQuery[query][i] > averageRef[query][i])) {   
889                                 isChimericTStudent[query][i] = true;   /* significantly higher at P<0.05 */ 
890                         }
891                 
892                         //anova test
893                         float value1 = sumQuery[query][i] + sumRef[query][i];
894                         float value2 = sumSquaredQuery[query][i] + sumSquaredRef[query][i];
895                         float value3 = ((sumQuery[query][i]*sumQuery[query][i]) / (float) (closest[query].size())) + ((sumRef[query][i] * sumRef[query][i]) / (float) refCombo[query]);
896                         float value4 = (value1 * value1) / ( (float) (closest[query].size() + refCombo[query]) );
897                         float value5 = value2 - value4;
898                         float value6 = value3 - value4;
899                         float value7 = value5 - value6;
900                         float value8 = value7 / ((float) degreeOfFreedom);
901                         float anovaValue = value6 / value8;
902                         
903                         float f = getF(degreeOfFreedom);
904                         
905                          if ((anovaValue >= f) && (averageQuery[query][i] > averageRef[query][i]))  {
906                        isChimericANOVA[query][i] = true;   /* significant P<0.05 */
907                 }
908                         
909                         anova[query][i] = anovaValue;
910                 }
911                 
912         }
913         catch(exception& e) {
914                 errorOut(e, "Ccode", "determineChimeras");
915                 exit(1);
916         }
917 }
918 /**************************************************************************************************/    
919 float Ccode::getT(int numseq) {
920         try {
921         
922                 float tvalue = 0;
923                 
924                 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
925                 if (numseq > 120) tvalue = 1.645;
926                 else if (numseq > 60) tvalue = 1.658;
927         else if (numseq > 40) tvalue = 1.671;
928         else if (numseq > 30) tvalue = 1.684;
929         else if (numseq > 29) tvalue = 1.697;
930         else if (numseq > 28) tvalue = 1.699;
931         else if (numseq > 27) tvalue = 1.701;
932         else if (numseq > 26) tvalue = 1.703;
933         else if (numseq > 25) tvalue = 1.706;
934         else if (numseq > 24) tvalue = 1.708;
935         else if (numseq > 23) tvalue = 1.711;
936         else if (numseq > 22) tvalue = 1.714;
937         else if (numseq > 21) tvalue = 1.717;
938         else if (numseq > 20) tvalue = 1.721;
939         else if (numseq > 19) tvalue = 1.725;
940         else if (numseq > 18) tvalue = 1.729;
941         else if (numseq > 17) tvalue = 1.734;
942         else if (numseq > 16) tvalue = 1.740;
943         else if (numseq > 15) tvalue = 1.746;
944         else if (numseq > 14) tvalue = 1.753;
945         else if (numseq > 13) tvalue = 1.761;
946         else if (numseq > 12) tvalue = 1.771;
947         else if (numseq > 11) tvalue = 1.782;
948         else if (numseq > 10) tvalue = 1.796;
949         else if (numseq > 9) tvalue = 1.812;
950         else if (numseq > 8) tvalue = 1.833;
951         else if (numseq > 7) tvalue = 1.860;
952         else if (numseq > 6) tvalue = 1.895;
953         else if (numseq > 5) tvalue = 1.943;
954         else if (numseq > 4) tvalue = 2.015;
955         else if (numseq > 3) tvalue = 2.132;
956         else if (numseq > 2) tvalue = 2.353;
957         else if (numseq > 1) tvalue = 2.920;
958                 else if (numseq <= 1) {
959                         mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine();
960                 }
961                 
962                 return tvalue;
963         }
964         catch(exception& e) {
965                 errorOut(e, "Ccode", "getT");
966                 exit(1);
967         }
968 }
969 /**************************************************************************************************/    
970 float Ccode::getF(int numseq) {
971         try {
972         
973                 float fvalue = 0;
974                 
975                  /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
976         if (numseq > 120) fvalue = 3.84;
977         else if (numseq > 60) fvalue = 3.92;
978         else if (numseq > 40) fvalue = 4.00;
979         else if (numseq > 30) fvalue = 4.08;
980         else if (numseq > 29) fvalue = 4.17;
981         else if (numseq > 28) fvalue = 4.18;
982         else if (numseq > 27) fvalue = 4.20;
983         else if (numseq > 26) fvalue = 4.21;
984         else if (numseq > 25) fvalue = 4.23;
985         else if (numseq > 24) fvalue = 4.24;
986         else if (numseq > 23) fvalue = 4.26;
987         else if (numseq > 22) fvalue = 4.28;
988         else if (numseq > 21) fvalue = 4.30;
989         else if (numseq > 20) fvalue = 4.32;
990         else if (numseq > 19) fvalue = 4.35;
991         else if (numseq > 18) fvalue = 4.38;
992         else if (numseq > 17) fvalue = 4.41;
993         else if (numseq > 16) fvalue = 4.45;
994         else if (numseq > 15) fvalue = 4.49;
995         else if (numseq > 14) fvalue = 4.54;
996         else if (numseq > 13) fvalue = 4.60;
997         else if (numseq > 12) fvalue = 4.67;
998         else if (numseq > 11) fvalue = 4.75;
999         else if (numseq > 10) fvalue = 4.84;
1000         else if (numseq > 9) fvalue = 4.96;
1001         else if (numseq > 8) fvalue = 5.12;
1002         else if (numseq > 7) fvalue = 5.32;
1003         else if (numseq > 6) fvalue = 5.59;
1004         else if (numseq > 5) fvalue = 5.99;
1005         else if (numseq > 4) fvalue = 6.61;
1006         else if (numseq > 3) fvalue = 7.71;
1007         else if (numseq > 2) fvalue = 10.1;
1008         else if (numseq > 1) fvalue = 18.5;
1009         else if (numseq > 0) fvalue = 161;
1010                 else if (numseq <= 0) {
1011                         mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine();
1012         }
1013                 
1014                 return fvalue;
1015         }
1016         catch(exception& e) {
1017                 errorOut(e, "Ccode", "getF");
1018                 exit(1);
1019         }
1020 }
1021
1022 /**************************************************************************************************/
1023 void Ccode::createProcessesClosest() {
1024         try {
1025 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1026                 int process = 0;
1027                 vector<int> processIDS;
1028                 
1029                 //loop through and create all the processes you want
1030                 while (process != processors) {
1031                         int pid = fork();
1032                         
1033                         if (pid > 0) {
1034                                 processIDS.push_back(pid);  
1035                                 process++;
1036                         }else if (pid == 0){
1037                                 
1038                                 mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
1039                                 closest = findClosest(lines[process]->start, lines[process]->end, numWanted);
1040                                 mothurOut("Done finding top matches for sequences " +  toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
1041                                 
1042                                 //write out data to file so parent can read it
1043                                 ofstream out;
1044                                 string s = toString(getpid()) + ".temp";
1045                                 openOutputFile(s, out);
1046                                 
1047                                 //output pairs
1048                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1049                                          for (int j = 0; j < closest[i].size(); j++) {
1050                                                 closest[i][j].seq->printSequence(out);
1051                                          }
1052                                 }
1053                                 out << ">" << endl; //to stop sequence read
1054                                 
1055                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1056                                          for (int j = 0; j < closest[i].size(); j++) {
1057                                                 out << closest[i][j].dist << '\t';
1058                                          }
1059                                          out << endl;
1060                                 }
1061
1062                                 out.close();
1063                                 
1064                                 exit(0);
1065                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1066                 }
1067                 
1068                 //force parent to wait until all the processes are done
1069                 for (int i=0;i<processors;i++) { 
1070                         int temp = processIDS[i];
1071                         wait(&temp);
1072                 }
1073                 
1074                 //get data created by processes
1075                 for (int i=0;i<processors;i++) { 
1076                         ifstream in;
1077                         string s = toString(processIDS[i]) + ".temp";
1078                         openInputFile(s, in);
1079                         
1080                         vector< vector<Sequence*> > tempClosest;  tempClosest.resize(querySeqs.size());
1081                         //get pairs
1082                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1083                                 vector<Sequence*> tempVector;
1084                                 
1085                                 for (int j = 0; j < numWanted; j++) {
1086                                 
1087                                         Sequence* temp = new Sequence(in);
1088                                         gobble(in);
1089                                                 
1090                                         tempVector.push_back(temp);
1091                                 }
1092                                 
1093                                 tempClosest[k] = tempVector;
1094                         }
1095                         
1096                         string junk;
1097                         in >> junk;  gobble(in);  // to get ">"
1098                         
1099                         vector< vector<float> > dists; dists.resize(querySeqs.size());
1100                         
1101                         for (int i = lines[process]->start; i < lines[process]->end; i++) {
1102                                 dists[i].resize(closest[i].size());
1103                                 for (int j = 0; j < closest[i].size(); j++) {
1104                                         in >> dists[i][j];
1105                                 }
1106                                 gobble(in);
1107                         } 
1108                         
1109                         for (int i = lines[process]->start; i < lines[process]->end; i++) {
1110                                 for (int j = 0; j < closest[i].size(); j++) {
1111                                         closest[i][j].seq = tempClosest[i][j];
1112                                         closest[i][j].dist = dists[i][j]; 
1113                                 }
1114                         } 
1115
1116                         in.close();
1117                         remove(s.c_str());
1118                 }
1119                         
1120         
1121 #else
1122                 closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
1123 #endif  
1124
1125         }
1126         catch(exception& e) {
1127                 errorOut(e, "Ccode", "createProcessesClosest");
1128                 exit(1);
1129         }
1130 }
1131
1132 //***************************************************************************************************************
1133