5 * Created by westcott on 8/24/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
15 //***************************************************************************************************************
16 Ccode::Ccode(string filename, string temp) { fastafile = filename; templateFile = temp; }
17 //***************************************************************************************************************
21 for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
22 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
26 errorOut(e, "Ccode", "~Ccode");
30 //***************************************************************************************************************
31 void Ccode::print(ostream& out) {
36 string mapInfo = getRootName(fastafile) + "mapinfo";
38 openOutputFile(mapInfo, out2);
40 out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
42 for (int j = 0; j < querySeqs.size(); j++) {
43 out2 << querySeqs[j]->getName() << endl;
44 for (it = spotMap[j].begin(); it!= spotMap[j].end(); it++) {
45 out2 << it->first << '\t' << it->second << endl;
50 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
52 out << "For full window mapping info refer to " << mapInfo << endl << endl;
54 for (int i = 0; i < querySeqs.size(); i++) {
56 out << querySeqs[i]->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
58 for (int j = 0; j < closest[i].size(); j++) {
59 out << setprecision(3) << closest[i][j].seq->getName() << '\t' << closest[i][j].dist << endl;
64 //window mapping info.
65 out << "Mapping information: " << endl;
66 //you mask and did not filter
67 if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
69 //you filtered and did not mask
70 if ((seqMask == "") && (filter)) { out << "filter and trim."; }
72 //you masked and filtered
73 if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
75 out << "Window\tStartPos\tEndPos" << endl;
78 for (int k = 0; k < windows[i].size()-1; k++) {
79 out << k+1 << '\t' << spotMap[i][windows[i][k]-it->first] << '\t' << spotMap[i][windows[i][k]-it->first+windowSizes[i]] << endl;
82 out << windows[i].size() << '\t' << spotMap[i][windows[i][windows[i].size()-1]-it->first] << '\t' << spotMap[i][it->second-it->first-1] << endl;
85 out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
86 for (int k = 0; k < windows[i].size(); k++) {
87 float ds = averageQuery[i][k] / averageRef[i][k];
88 out << k+1 << '\t' << averageQuery[i][k] << '\t' << sdQuery[i][k] << '\t' << averageRef[i][k] << '\t'<< sdRef[i][k] << '\t' << ds << '\t' << anova[i][k] << endl;
94 /* F test for differences among variances.
95 * varQuery is expected to be higher or similar than varRef */
96 //float fs = varQuery[query][i] / varRef[query][i]; /* F-Snedecor, test for differences of variances */
100 //confidence limit, t - Student, anova
101 out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
103 for (int k = 0; k < windows[i].size(); k++) {
105 if (isChimericConfidence[i][k]) { temp += "*\t"; }
106 else { temp += "\t"; }
108 if (isChimericTStudent[i][k]) { temp += "*\t"; }
109 else { temp += "\t"; }
111 if (isChimericANOVA[i][k]) { temp += "*\t"; }
112 else { temp += "\t"; }
114 out << k+1 << '\t' << temp << endl;
116 if (temp != "\t\t\t") { results = true; }
121 mothurOut(querySeqs[i]->getName() + " was found have at least one chimeric window."); mothurOutEndLine();
125 catch(exception& e) {
126 errorOut(e, "Ccode", "print");
131 //***************************************************************************************************************
132 void Ccode::getChimeras() {
135 //read in query sequences and subject sequences
136 mothurOut("Reading sequences and template file... "); cout.flush();
137 querySeqs = readSeqs(fastafile);
138 templateSeqs = readSeqs(templateFile);
139 mothurOut("Done."); mothurOutEndLine();
141 int numSeqs = querySeqs.size();
143 closest.resize(numSeqs);
145 refCombo.resize(numSeqs, 0);
146 sumRef.resize(numSeqs);
147 varRef.resize(numSeqs);
148 varQuery.resize(numSeqs);
149 sdRef.resize(numSeqs);
150 sdQuery.resize(numSeqs);
151 sumQuery.resize(numSeqs);
152 sumSquaredRef.resize(numSeqs);
153 sumSquaredQuery.resize(numSeqs);
154 averageRef.resize(numSeqs);
155 averageQuery.resize(numSeqs);
156 anova.resize(numSeqs);
157 isChimericConfidence.resize(numSeqs);
158 isChimericTStudent.resize(numSeqs);
159 isChimericANOVA.resize(numSeqs);
160 trim.resize(numSeqs);
161 spotMap.resize(numSeqs);
162 windowSizes.resize(numSeqs, window);
163 windows.resize(numSeqs);
165 //break up file if needed
166 int linesPerProcess = numSeqs / processors ;
168 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
169 //find breakup of sequences for all times we will Parallelize
170 if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
173 for (int i = 0; i < (processors-1); i++) {
174 lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
176 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
177 int i = processors - 1;
178 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
181 //find breakup of templatefile for quantiles
182 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
184 for (int i = 0; i < processors; i++) {
185 templateLines.push_back(new linePair());
186 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
187 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
191 lines.push_back(new linePair(0, numSeqs));
192 templateLines.push_back(new linePair(0, templateSeqs.size()));
195 distCalc = new eachGapDist();
196 decalc = new DeCalculator();
199 if (processors == 1) {
200 mothurOut("Finding top matches for sequences... "); cout.flush();
201 closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
202 mothurOut("Done."); mothurOutEndLine();
203 }else { createProcessesClosest(); }
206 for (int i = 0; i < closest.size(); i++) {
207 //cout << querySeqs[i]->getName() << ": ";
208 string file = querySeqs[i]->getName() + ".input";
210 openOutputFile(file, o);
212 querySeqs[i]->printSequence(o);
213 for (int j = 0; j < closest[i].size(); j++) {
214 //cout << closest[i][j].seq->getName() << '\t';
215 closest[i][j].seq->printSequence(o);
220 for (int j = 0; j < numSeqs; j++) {
221 for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) {
226 //mask sequences if the user wants to
229 for (int i = 0; i < querySeqs.size(); i++) {
230 decalc->runMask(querySeqs[i]);
234 for (int i = 0; i < templateSeqs.size(); i++) {
235 decalc->runMask(templateSeqs[i]);
238 for (int i = 0; i < numSeqs; i++) {
239 spotMap[i] = decalc->getMaskMap();
244 vector<Sequence*> temp = templateSeqs;
245 for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]); }
249 runFilter(querySeqs);
250 runFilter(templateSeqs);
253 map<int, int> newMap;
257 for (int i = 0; i < filterString.length(); i++) {
258 if (filterString[i] == 1) {
260 newMap[spot] = spotMap[j][i];
265 for (int i = 0; i < numSeqs; i++) {
270 //trim sequences - this follows ccodes remove_extra_gaps
271 for (int i = 0; i < querySeqs.size(); i++) {
275 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().
276 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
277 for (int i = 0; i < querySeqs.size(); i++) {
278 windows[i] = findWindows(i);
281 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later - should be paralellized
282 for (int i = 0; i < closest.size(); i++) {
283 removeBadReferenceSeqs(closest[i], i);
287 //find the averages for each querys references - should be paralellized
288 for (int i = 0; i < numSeqs; i++) {
289 getAverageRef(closest[i], i); //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
292 //find the averages for the query - should be paralellized
293 for (int i = 0; i < numSeqs; i++) {
294 getAverageQuery(closest[i], i); //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
297 //find the averages for each querys references - should be paralellized
298 for (int i = 0; i < numSeqs; i++) {
299 findVarianceRef(i); //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
302 //find the averages for the query - should be paralellized
303 for (int i = 0; i < numSeqs; i++) {
304 findVarianceQuery(i); //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
307 for (int i = 0; i < numSeqs; i++) {
308 determineChimeras(i); //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. - should be paralellized
312 for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
313 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
316 catch(exception& e) {
317 errorOut(e, "Ccode", "getChimeras");
321 /***************************************************************************************************************/
322 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
323 void Ccode::trimSequences(int query) {
326 int frontPos = 0; //should contain first position in all seqs that is not a gap character
327 int rearPos = querySeqs[query]->getAligned().length();
329 //********find first position in closest seqs that is a non gap character***********//
330 //find first position all query seqs that is a non gap character
331 for (int i = 0; i < closest[query].size(); i++) {
333 string aligned = closest[query][i].seq->getAligned();
336 //find first spot in this seq
337 for (int j = 0; j < aligned.length(); j++) {
338 if (isalpha(aligned[j])) {
344 //save this spot if it is the farthest
345 if (pos > frontPos) { frontPos = pos; }
348 //find first position all querySeq[query] that is a non gap character
349 string aligned = querySeqs[query]->getAligned();
352 //find first spot in this seq
353 for (int j = 0; j < aligned.length(); j++) {
354 if (isalpha(aligned[j])) {
360 //save this spot if it is the farthest
361 if (pos > frontPos) { frontPos = pos; }
364 //********find last position in closest seqs that is a non gap character***********//
365 for (int i = 0; i < closest[query].size(); i++) {
367 string aligned = closest[query][i].seq->getAligned();
368 int pos = aligned.length();
370 //find first spot in this seq
371 for (int j = aligned.length()-1; j >= 0; j--) {
372 if (isalpha(aligned[j])) {
378 //save this spot if it is the farthest
379 if (pos < rearPos) { rearPos = pos; }
382 //find last position all querySeqs[query] that is a non gap character
383 aligned = querySeqs[query]->getAligned();
384 pos = aligned.length();
386 //find first spot in this seq
387 for (int j = aligned.length()-1; j >= 0; j--) {
388 if (isalpha(aligned[j])) {
394 //save this spot if it is the farthest
395 if (pos < rearPos) { rearPos = pos; }
398 //check to make sure that is not whole seq
399 if ((rearPos - frontPos - 1) <= 0) { mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1); }
401 map<int, int> tempTrim;
402 tempTrim[frontPos] = rearPos;
404 //save trimmed locations
405 trim[query] = tempTrim;
408 map<int, int> newMap;
411 for (int i = frontPos; i < rearPos; i++) {
413 //cout << query << '\t' << i << '\t' << spotMap[query][i] << endl;
414 newMap[spot] = spotMap[query][i];
418 //for (it = newMap.begin(); it!=newMap.end(); it++) {
419 //cout << query << '\t' << it->first << '\t' << it->second << endl;
422 spotMap[query] = newMap;
426 catch(exception& e) {
427 errorOut(e, "Ccode", "trimSequences");
432 /***************************************************************************************************************/
433 vector<int> Ccode::findWindows(int query) {
437 it = trim[query].begin();
439 int length = it->second - it->first;
441 //default is wanted = 10% of total length
442 if (windowSizes[query] > length) {
443 mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
444 windowSizes[query] = length / 10;
445 }else if (windowSizes[query] == 0) { windowSizes[query] = length / 10; }
446 else if (windowSizes[query] > (length / 20)) {
447 mothurOut("You have selected a window that is larger than 20% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine();
448 }else if (windowSizes[query] < (length / 5)) {
449 mothurOut("You have selected a window that is smaller than 5% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine();
452 //save starting points of each window
453 for (int m = it->first; m < (it->second-windowSizes[query]); m+=windowSizes[query]) { win.push_back(m); }
456 if (win[win.size()-1] < (it->first+length)) {
457 win.push_back(win[win.size()-1]+windowSizes[query]); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
458 } //with this you would get 1,25,50,75,100
464 catch(exception& e) {
465 errorOut(e, "Ccode", "findWindows");
469 //***************************************************************************************************************
470 int Ccode::getDiff(string seqA, string seqB) {
475 for (int i = 0; i < seqA.length(); i++) {
476 //if you are both not gaps
477 //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
479 if (seqA[i] != seqB[i]) {
480 int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
482 /* the char in base_a and base_b have been checked and they are different */
483 if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
484 else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
485 else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
486 else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
487 else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
488 else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
489 else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
490 else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
491 else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
492 else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
493 else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
494 else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
495 else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
496 else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
497 else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
498 else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
499 else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
500 else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
501 else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
502 else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
503 else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
504 else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
505 else ok = 0; /* the bases are different and not equivalent */
507 //check if they are both blanks
508 if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
509 else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
511 if (ok == 0) { numDiff++; }
519 catch(exception& e) {
520 errorOut(e, "Ccode", "getDiff");
524 //***************************************************************************************************************
525 //tried to make this look most like ccode original implementation
526 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs, int query) {
529 vector< vector<int> > numDiffBases;
530 numDiffBases.resize(seqs.size());
532 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
534 it = trim[query].begin();
535 int length = it->second - it->first;
537 //calc differences from each sequence to everyother seq in the set
538 for (int i = 0; i < seqs.size(); i++) {
540 string seqA = seqs[i].seq->getAligned().substr(it->first, length);
542 //so you don't calc i to j and j to i since they are the same
543 for (int j = 0; j < i; j++) {
545 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
548 int numDiff = getDiff(seqA, seqB);
550 numDiffBases[i][j] = numDiff;
551 numDiffBases[j][i] = numDiff;
555 //initailize remove to 0
556 vector<int> remove; remove.resize(seqs.size(), 0);
557 float top = ((20*length) / (float) 100);
558 float bottom = ((0.5*length) / (float) 100);
560 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
561 for (int i = 0; i < numDiffBases.size(); i++) {
562 for (int j = 0; j < i; j++) {
563 //are you more than 20% different
564 if (numDiffBases[i][j] > top) { remove[j] = 1; }
565 //are you less than 0.5% different
566 if (numDiffBases[i][j] < bottom) { remove[j] = 1; }
572 //count seqs that are not going to be removed
573 for (int i = 0; i < remove.size(); i++) {
574 if (remove[i] == 0) { numSeqsLeft++; }
577 //if you have enough then remove bad ones
578 if (numSeqsLeft >= 3) {
579 vector<SeqDist> goodSeqs;
581 for (int i = 0; i < remove.size(); i++) {
582 if (remove[i] == 0) {
583 goodSeqs.push_back(seqs[i]);
589 }else { //warn, but dont remove any
590 mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity. I will continue, but please check."); mothurOutEndLine();
595 catch(exception& e) {
596 errorOut(e, "Ccode", "removeBadReferenceSeqs");
600 //***************************************************************************************************************
601 vector< vector<SeqDist> > Ccode::findClosest(int start, int end, int numWanted) {
604 vector< vector<SeqDist> > topMatches; topMatches.resize(querySeqs.size());
606 float smallestOverall, smallestLeft, smallestRight;
607 smallestOverall = 1000; smallestLeft = 1000; smallestRight = 1000;
609 //for each sequence in querySeqs - find top matches to use as reference
610 for(int j = start; j < end; j++){
612 Sequence query = *(querySeqs[j]);
614 vector<SeqDist> distances;
616 //calc distance to each sequence in template seqs
617 for (int i = 0; i < templateSeqs.size(); i++) {
619 Sequence ref = *(templateSeqs[i]);
622 distCalc->calcDist(query, ref);
623 float dist = distCalc->getDist();
627 temp.seq = templateSeqs[i];
630 distances.push_back(temp);
633 sort(distances.begin(), distances.end(), compareSeqDist);
635 //save the number of top matches wanted
636 for (int h = 0; h < numWanted; h++) {
637 topMatches[j].push_back(distances[h]);
644 catch(exception& e) {
645 errorOut(e, "Ccode", "findClosestSides");
649 /**************************************************************************************************/
650 //find the distances from each reference sequence to every other reference sequence for each window for this query
651 void Ccode::getAverageRef(vector<SeqDist> ref, int query) {
654 vector< vector< vector<int> > > diffs; //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
656 //initialize diffs vector
657 diffs.resize(ref.size());
658 for (int i = 0; i < diffs.size(); i++) {
659 diffs[i].resize(ref.size());
660 for (int j = 0; j < diffs[i].size(); j++) {
661 diffs[i][j].resize(windows[query].size(), 0);
665 it = trim[query].begin();
667 //find the distances from each reference sequence to every other reference sequence for each window for this query
668 for (int i = 0; i < ref.size(); i++) {
670 string refI = ref[i].seq->getAligned();
672 //j<i, so you don't find distances from i to j and then j to i.
673 for (int j = 0; j < i; j++) {
675 string refJ = ref[j].seq->getAligned();
677 for (int k = 0; k < windows[query].size(); k++) {
679 string refIWindowk, refJWindowk;
681 if (k < windows[query].size()-1) {
683 refIWindowk = refI.substr(windows[query][k], windowSizes[query]);
684 refJWindowk = refJ.substr(windows[query][k], windowSizes[query]);
685 }else { //last window may be smaller than rest - see findwindows
687 refIWindowk = refI.substr(windows[query][k], (it->second-windows[query][k]));
688 refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k]));
692 int diff = getDiff(refIWindowk, refJWindowk);
693 //cout << i << '\t' << j << '\t' << k << '\t' << diff << endl;
694 //save differences in [i][j][k] and [j][i][k] since they are the same
695 diffs[i][j][k] = diff;
696 diffs[j][i][k] = diff;
704 //initialize sumRef for this query
705 sumRef[query].resize(windows[query].size(), 0);
706 sumSquaredRef[query].resize(windows[query].size(), 0);
707 averageRef[query].resize(windows[query].size(), 0);
709 //find the sum of the differences for hte reference sequences
710 for (int i = 0; i < diffs.size(); i++) {
711 for (int j = 0; j < i; j++) {
713 //increment this querys reference sequences combos
716 for (int k = 0; k < diffs[i][j].size(); k++) {
717 sumRef[query][k] += diffs[i][j][k];
718 sumSquaredRef[query][k] += (diffs[i][j][k]*diffs[i][j][k]);
725 //find the average of the differences for the references for each window
726 for (int i = 0; i < windows[query].size(); i++) {
727 averageRef[query][i] = sumRef[query][i] / (float) refCombo[query];
731 catch(exception& e) {
732 errorOut(e, "Ccode", "getAverageRef");
736 /**************************************************************************************************/
737 void Ccode::getAverageQuery (vector<SeqDist> ref, int query) {
740 vector< vector<int> > diffs; //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
742 //initialize diffs vector
743 diffs.resize(ref.size());
744 for (int j = 0; j < diffs.size(); j++) {
745 diffs[j].resize(windows[query].size(), 0);
748 it = trim[query].begin();
750 string refQuery = querySeqs[query]->getAligned();
752 //j<i, so you don't find distances from i to j and then j to i.
753 for (int j = 0; j < ref.size(); j++) {
755 string refJ = ref[j].seq->getAligned();
757 for (int k = 0; k < windows[query].size(); k++) {
759 string QueryWindowk, refJWindowk;
761 if (k < windows[query].size()-1) {
763 QueryWindowk = refQuery.substr(windows[query][k], windowSizes[query]);
764 refJWindowk = refJ.substr(windows[query][k], windowSizes[query]);
765 }else { //last window may be smaller than rest - see findwindows
767 QueryWindowk = refQuery.substr(windows[query][k], (it->second-windows[query][k]));
768 refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k]));
772 int diff = getDiff(QueryWindowk, refJWindowk);
773 //cout << j << '\t' << k << '\t' << diff << endl;
781 //initialize sumRef for this query
782 sumQuery[query].resize(windows[query].size(), 0);
783 sumSquaredQuery[query].resize(windows[query].size(), 0);
784 averageQuery[query].resize(windows[query].size(), 0);
786 //find the sum of the differences
787 for (int j = 0; j < diffs.size(); j++) {
788 for (int k = 0; k < diffs[j].size(); k++) {
789 sumQuery[query][k] += diffs[j][k];
790 sumSquaredQuery[query][k] += (diffs[j][k]*diffs[j][k]);
795 //find the average of the differences for the references for each window
796 for (int i = 0; i < windows[query].size(); i++) {
797 averageQuery[query][i] = sumQuery[query][i] / (float) ref.size();
802 catch(exception& e) {
803 errorOut(e, "Ccode", "getAverageQuery");
807 /**************************************************************************************************/
808 void Ccode::findVarianceRef (int query) {
811 varRef[query].resize(windows[query].size(), 0);
812 sdRef[query].resize(windows[query].size(), 0);
815 for (int i = 0; i < windows[query].size(); i++) {
816 varRef[query][i] = (sumSquaredRef[query][i] - ((sumRef[query][i]*sumRef[query][i])/(float)refCombo[query])) / (float)(refCombo[query]-1);
817 sdRef[query][i] = sqrt(varRef[query][i]);
819 //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
820 if (averageRef[query][i] < 0.001) { averageRef[query][i] = 0.001; }
821 if (sumRef[query][i] < 0.001) { sumRef[query][i] = 0.001; }
822 if (varRef[query][i] < 0.001) { varRef[query][i] = 0.001; }
823 if (sumSquaredRef[query][i] < 0.001) { sumSquaredRef[query][i] = 0.001; }
824 if (sdRef[query][i] < 0.001) { sdRef[query][i] = 0.001; }
828 catch(exception& e) {
829 errorOut(e, "Ccode", "findVarianceRef");
833 /**************************************************************************************************/
834 void Ccode::findVarianceQuery (int query) {
836 varQuery[query].resize(windows[query].size(), 0);
837 sdQuery[query].resize(windows[query].size(), 0);
840 for (int i = 0; i < windows[query].size(); i++) {
841 varQuery[query][i] = (sumSquaredQuery[query][i] - ((sumQuery[query][i]*sumQuery[query][i])/(float) closest[query].size())) / (float) (closest[query].size()-1);
842 sdQuery[query][i] = sqrt(varQuery[query][i]);
844 //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
845 if (averageQuery[query][i] < 0.001) { averageQuery[query][i] = 0.001; }
846 if (sumQuery[query][i] < 0.001) { sumQuery[query][i] = 0.001; }
847 if (varQuery[query][i] < 0.001) { varQuery[query][i] = 0.001; }
848 if (sumSquaredQuery[query][i] < 0.001) { sumSquaredQuery[query][i] = 0.001; }
849 if (sdQuery[query][i] < 0.001) { sdQuery[query][i] = 0.001; }
853 catch(exception& e) {
854 errorOut(e, "Ccode", "findVarianceQuery");
858 /**************************************************************************************************/
859 void Ccode::determineChimeras (int query) {
862 isChimericConfidence[query].resize(windows[query].size(), false);
863 isChimericTStudent[query].resize(windows[query].size(), false);
864 isChimericANOVA[query].resize(windows[query].size(), false);
865 anova[query].resize(windows[query].size());
869 for (int i = 0; i < windows[query].size(); i++) {
871 //get confidence limits
872 float t = getT(closest[query].size()-1); //how many seqs you are comparing to this querySeq
873 float dsUpper = (averageQuery[query][i] + (t * sdQuery[query][i])) / averageRef[query][i];
874 float dsLower = (averageQuery[query][i] - (t * sdQuery[query][i])) / averageRef[query][i];
875 //cout << t << '\t' << "ds upper = " << dsUpper << " dsLower = " << dsLower << endl;
876 if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[query][i] > averageRef[query][i])) { /* range does not include 1 */
877 isChimericConfidence[query][i] = true; /* significantly higher at P<0.05 */
878 //cout << i << " made it here" << endl;
882 int degreeOfFreedom = refCombo[query] + closest[query].size() - 2;
883 float denomForT = (((refCombo[query]-1) * varQuery[query][i] + (closest[query].size() - 1) * varRef[query][i]) / (float) degreeOfFreedom) * ((refCombo[query] + closest[query].size()) / (float) (refCombo[query] * closest[query].size())); /* denominator, without sqrt(), for ts calculations */
885 float ts = fabs((averageQuery[query][i] - averageRef[query][i]) / (sqrt(denomForT))); /* value of ts for t-student test */
886 t = getT(degreeOfFreedom);
887 //cout << i << '\t' << t << '\t' << ts << endl;
888 if ((ts >= t) && (averageQuery[query][i] > averageRef[query][i])) {
889 isChimericTStudent[query][i] = true; /* significantly higher at P<0.05 */
893 float value1 = sumQuery[query][i] + sumRef[query][i];
894 float value2 = sumSquaredQuery[query][i] + sumSquaredRef[query][i];
895 float value3 = ((sumQuery[query][i]*sumQuery[query][i]) / (float) (closest[query].size())) + ((sumRef[query][i] * sumRef[query][i]) / (float) refCombo[query]);
896 float value4 = (value1 * value1) / ( (float) (closest[query].size() + refCombo[query]) );
897 float value5 = value2 - value4;
898 float value6 = value3 - value4;
899 float value7 = value5 - value6;
900 float value8 = value7 / ((float) degreeOfFreedom);
901 float anovaValue = value6 / value8;
903 float f = getF(degreeOfFreedom);
905 if ((anovaValue >= f) && (averageQuery[query][i] > averageRef[query][i])) {
906 isChimericANOVA[query][i] = true; /* significant P<0.05 */
909 anova[query][i] = anovaValue;
913 catch(exception& e) {
914 errorOut(e, "Ccode", "determineChimeras");
918 /**************************************************************************************************/
919 float Ccode::getT(int numseq) {
924 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
925 if (numseq > 120) tvalue = 1.645;
926 else if (numseq > 60) tvalue = 1.658;
927 else if (numseq > 40) tvalue = 1.671;
928 else if (numseq > 30) tvalue = 1.684;
929 else if (numseq > 29) tvalue = 1.697;
930 else if (numseq > 28) tvalue = 1.699;
931 else if (numseq > 27) tvalue = 1.701;
932 else if (numseq > 26) tvalue = 1.703;
933 else if (numseq > 25) tvalue = 1.706;
934 else if (numseq > 24) tvalue = 1.708;
935 else if (numseq > 23) tvalue = 1.711;
936 else if (numseq > 22) tvalue = 1.714;
937 else if (numseq > 21) tvalue = 1.717;
938 else if (numseq > 20) tvalue = 1.721;
939 else if (numseq > 19) tvalue = 1.725;
940 else if (numseq > 18) tvalue = 1.729;
941 else if (numseq > 17) tvalue = 1.734;
942 else if (numseq > 16) tvalue = 1.740;
943 else if (numseq > 15) tvalue = 1.746;
944 else if (numseq > 14) tvalue = 1.753;
945 else if (numseq > 13) tvalue = 1.761;
946 else if (numseq > 12) tvalue = 1.771;
947 else if (numseq > 11) tvalue = 1.782;
948 else if (numseq > 10) tvalue = 1.796;
949 else if (numseq > 9) tvalue = 1.812;
950 else if (numseq > 8) tvalue = 1.833;
951 else if (numseq > 7) tvalue = 1.860;
952 else if (numseq > 6) tvalue = 1.895;
953 else if (numseq > 5) tvalue = 1.943;
954 else if (numseq > 4) tvalue = 2.015;
955 else if (numseq > 3) tvalue = 2.132;
956 else if (numseq > 2) tvalue = 2.353;
957 else if (numseq > 1) tvalue = 2.920;
958 else if (numseq <= 1) {
959 mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine();
964 catch(exception& e) {
965 errorOut(e, "Ccode", "getT");
969 /**************************************************************************************************/
970 float Ccode::getF(int numseq) {
975 /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
976 if (numseq > 120) fvalue = 3.84;
977 else if (numseq > 60) fvalue = 3.92;
978 else if (numseq > 40) fvalue = 4.00;
979 else if (numseq > 30) fvalue = 4.08;
980 else if (numseq > 29) fvalue = 4.17;
981 else if (numseq > 28) fvalue = 4.18;
982 else if (numseq > 27) fvalue = 4.20;
983 else if (numseq > 26) fvalue = 4.21;
984 else if (numseq > 25) fvalue = 4.23;
985 else if (numseq > 24) fvalue = 4.24;
986 else if (numseq > 23) fvalue = 4.26;
987 else if (numseq > 22) fvalue = 4.28;
988 else if (numseq > 21) fvalue = 4.30;
989 else if (numseq > 20) fvalue = 4.32;
990 else if (numseq > 19) fvalue = 4.35;
991 else if (numseq > 18) fvalue = 4.38;
992 else if (numseq > 17) fvalue = 4.41;
993 else if (numseq > 16) fvalue = 4.45;
994 else if (numseq > 15) fvalue = 4.49;
995 else if (numseq > 14) fvalue = 4.54;
996 else if (numseq > 13) fvalue = 4.60;
997 else if (numseq > 12) fvalue = 4.67;
998 else if (numseq > 11) fvalue = 4.75;
999 else if (numseq > 10) fvalue = 4.84;
1000 else if (numseq > 9) fvalue = 4.96;
1001 else if (numseq > 8) fvalue = 5.12;
1002 else if (numseq > 7) fvalue = 5.32;
1003 else if (numseq > 6) fvalue = 5.59;
1004 else if (numseq > 5) fvalue = 5.99;
1005 else if (numseq > 4) fvalue = 6.61;
1006 else if (numseq > 3) fvalue = 7.71;
1007 else if (numseq > 2) fvalue = 10.1;
1008 else if (numseq > 1) fvalue = 18.5;
1009 else if (numseq > 0) fvalue = 161;
1010 else if (numseq <= 0) {
1011 mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine();
1016 catch(exception& e) {
1017 errorOut(e, "Ccode", "getF");
1022 /**************************************************************************************************/
1023 void Ccode::createProcessesClosest() {
1025 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1027 vector<int> processIDS;
1029 //loop through and create all the processes you want
1030 while (process != processors) {
1034 processIDS.push_back(pid);
1036 }else if (pid == 0){
1038 mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
1039 closest = findClosest(lines[process]->start, lines[process]->end, numWanted);
1040 mothurOut("Done finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
1042 //write out data to file so parent can read it
1044 string s = toString(getpid()) + ".temp";
1045 openOutputFile(s, out);
1048 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1049 for (int j = 0; j < closest[i].size(); j++) {
1050 closest[i][j].seq->printSequence(out);
1053 out << ">" << endl; //to stop sequence read
1055 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1056 for (int j = 0; j < closest[i].size(); j++) {
1057 out << closest[i][j].dist << '\t';
1065 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1068 //force parent to wait until all the processes are done
1069 for (int i=0;i<processors;i++) {
1070 int temp = processIDS[i];
1074 //get data created by processes
1075 for (int i=0;i<processors;i++) {
1077 string s = toString(processIDS[i]) + ".temp";
1078 openInputFile(s, in);
1080 vector< vector<Sequence*> > tempClosest; tempClosest.resize(querySeqs.size());
1082 for (int k = lines[i]->start; k < lines[i]->end; k++) {
1083 vector<Sequence*> tempVector;
1085 for (int j = 0; j < numWanted; j++) {
1087 Sequence* temp = new Sequence(in);
1090 tempVector.push_back(temp);
1093 tempClosest[k] = tempVector;
1097 in >> junk; gobble(in); // to get ">"
1099 vector< vector<float> > dists; dists.resize(querySeqs.size());
1101 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1102 dists[i].resize(closest[i].size());
1103 for (int j = 0; j < closest[i].size(); j++) {
1109 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1110 for (int j = 0; j < closest[i].size(); j++) {
1111 closest[i][j].seq = tempClosest[i][j];
1112 closest[i][j].dist = dists[i][j];
1122 closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
1126 catch(exception& e) {
1127 errorOut(e, "Ccode", "createProcessesClosest");
1132 //***************************************************************************************************************