5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
13 //***************************************************************************************************************
15 Pintail::Pintail(string filename, string temp) { fastafile = filename; templateFile = temp; }
16 //***************************************************************************************************************
20 for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
21 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
24 errorOut(e, "Pintail", "~Pintail");
28 //***************************************************************************************************************
29 void Pintail::print(ostream& out) {
32 for (int i = 0; i < querySeqs.size(); i++) {
34 int index = ceil(deviation[i]);
36 //is your DE value higher than the 95%
38 if (DE[i] > quantiles[index][4]) { chimera = "Yes"; }
39 else { chimera = "No"; }
41 out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl;
42 if (chimera == "Yes") {
43 mothurOut(querySeqs[i]->getName() + "\tdiv: " + toString(deviation[i]) + "\tstDev: " + toString(DE[i]) + "\tchimera flag: " + chimera); mothurOutEndLine();
47 for (int j = 0; j < obsDistance[i].size(); j++) { out << obsDistance[i][j] << '\t'; }
52 for (int m = 0; m < expectedDistance[i].size(); m++) { out << expectedDistance[i][m] << '\t'; }
58 errorOut(e, "Pintail", "print");
63 //***************************************************************************************************************
64 void Pintail::getChimeras() {
67 //read in query sequences and subject sequences
68 mothurOut("Reading sequences and template file... "); cout.flush();
69 querySeqs = readSeqs(fastafile);
70 templateSeqs = readSeqs(templateFile);
71 mothurOut("Done."); mothurOutEndLine();
73 int numSeqs = querySeqs.size();
75 obsDistance.resize(numSeqs);
76 expectedDistance.resize(numSeqs);
77 seqCoef.resize(numSeqs);
80 bestfit.resize(numSeqs);
81 deviation.resize(numSeqs);
82 trimmed.resize(numSeqs);
83 windowSizes.resize(numSeqs, window);
84 windowSizesTemplate.resize(templateSeqs.size(), window);
85 windowsForeachQuery.resize(numSeqs);
87 quantiles.resize(100); //one for every percent mismatch
89 //break up file if needed
90 int linesPerProcess = numSeqs / processors ;
92 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
93 //find breakup of sequences for all times we will Parallelize
94 if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
97 for (int i = 0; i < (processors-1); i++) {
98 lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
100 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
101 int i = processors - 1;
102 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
105 //find breakup of templatefile for quantiles
106 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
108 for (int i = 0; i < processors; i++) {
109 templateLines.push_back(new linePair());
110 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
111 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
115 lines.push_back(new linePair(0, numSeqs));
116 templateLines.push_back(new linePair(0, templateSeqs.size()));
119 distcalculator = new ignoreGaps();
120 decalc = new DeCalculator();
122 decalc->setMask(seqMask);
125 for (int i = 0; i < querySeqs.size(); i++) {
126 decalc->runMask(querySeqs[i]);
130 for (int i = 0; i < templateSeqs.size(); i++) {
131 decalc->runMask(templateSeqs[i]);
134 for (int i = 0; i < lines.size(); i++) { cout << "line pair " << i << " = " << lines[i]->start << '\t' << lines[i]->end << endl; }
136 if (processors == 1) {
137 mothurOut("Finding closest sequence in template to each sequence... "); cout.flush();
138 bestfit = findPairs(lines[0]->start, lines[0]->end);
140 //ex.align matches from wigeon
141 for (int m = 0; m < templateSeqs.size(); m++) {
142 if (templateSeqs[m]->getName() == "159481") { bestfit[17] = *(templateSeqs[m]); }
143 if (templateSeqs[m]->getName() == "100137") { bestfit[16] = *(templateSeqs[m]); }
144 if (templateSeqs[m]->getName() == "112956") { bestfit[15] = *(templateSeqs[m]); }
145 if (templateSeqs[m]->getName() == "102326") { bestfit[14] = *(templateSeqs[m]); }
146 if (templateSeqs[m]->getName() == "66229") { bestfit[13] = *(templateSeqs[m]); }
147 if (templateSeqs[m]->getName() == "206276") { bestfit[12] = *(templateSeqs[m]); }
148 if (templateSeqs[m]->getName() == "63607") { bestfit[11] = *(templateSeqs[m]); }
149 if (templateSeqs[m]->getName() == "7056") { bestfit[10] = *(templateSeqs[m]); }
150 if (templateSeqs[m]->getName() == "7088") { bestfit[9] = *(templateSeqs[m]); }
151 if (templateSeqs[m]->getName() == "17553") { bestfit[8] = *(templateSeqs[m]); }
152 if (templateSeqs[m]->getName() == "131723") { bestfit[7] = *(templateSeqs[m]); }
153 if (templateSeqs[m]->getName() == "69013") { bestfit[6] = *(templateSeqs[m]); }
154 if (templateSeqs[m]->getName() == "24543") { bestfit[5] = *(templateSeqs[m]); }
155 if (templateSeqs[m]->getName() == "27824") { bestfit[4] = *(templateSeqs[m]); }
156 if (templateSeqs[m]->getName() == "1456") { bestfit[3] = *(templateSeqs[m]); }
157 if (templateSeqs[m]->getName() == "1456") { bestfit[2] = *(templateSeqs[m]); }
158 if (templateSeqs[m]->getName() == "141312") { bestfit[1] = *(templateSeqs[m]); }
159 if (templateSeqs[m]->getName() == "141312") { bestfit[0] = *(templateSeqs[m]); }
164 for (int j = 0; j < bestfit.size(); j++) {
165 //chops off beginning and end of sequences so they both start and end with a base
166 decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);
168 mothurOut("Done."); mothurOutEndLine();
170 mothurOut("Finding window breaks... "); cout.flush();
171 for (int i = lines[0]->start; i < lines[0]->end; i++) {
172 it = trimmed[i].begin();
173 //cout << "trimmed = " << it->first << '\t' << it->second << endl;
174 vector<int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
175 windowsForeachQuery[i] = win;
177 mothurOut("Done."); mothurOutEndLine();
179 }else { createProcessesSpots(); }
182 mothurOut("Getting conservation... "); cout.flush();
183 if (consfile == "") {
184 mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the quantiles to a .prob file so that you can input them using the conservation parameter next time you run this command. Providing the .prob file will dramatically improve speed. "); cout.flush();
185 probabilityProfile = decalc->calcFreq(templateSeqs, templateFile);
186 mothurOut("Done."); mothurOutEndLine();
187 }else { probabilityProfile = readFreq(); }
190 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; }
191 mothurOut("Done."); mothurOutEndLine();
193 if (processors == 1) {
195 mothurOut("Calculating observed distance... "); cout.flush();
196 for (int i = lines[0]->start; i < lines[0]->end; i++) {
197 //cout << querySeqs[i]->getName() << '\t' << bestfit[i].getName() << " windows = " << windowsForeachQuery[i].size() << " size = " << windowSizes[i] << endl;
198 vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
199 obsDistance[i] = obsi;
201 mothurOut("Done."); mothurOutEndLine();
204 mothurOut("Finding variability... "); cout.flush();
205 for (int i = lines[0]->start; i < lines[0]->end; i++) {
206 vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
209 //cout << i+1 << endl;
210 //for (int j = 0; j < Qav[i].size(); j++) {
211 //cout << Qav[i][j] << '\t';
213 //cout << endl << endl;
216 mothurOut("Done."); mothurOutEndLine();
219 mothurOut("Calculating alpha... "); cout.flush();
220 for (int i = lines[0]->start; i < lines[0]->end; i++) {
221 float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
222 //cout << i+1 << "\tcoef = " << alpha << endl;
225 mothurOut("Done."); mothurOutEndLine();
228 mothurOut("Calculating expected distance... "); cout.flush();
229 for (int i = lines[0]->start; i < lines[0]->end; i++) {
230 vector<float> exp = decalc->calcExpected(Qav[i], seqCoef[i]);
231 expectedDistance[i] = exp;
233 mothurOut("Done."); mothurOutEndLine();
236 mothurOut("Finding deviation... "); cout.flush();
237 for (int i = lines[0]->start; i < lines[0]->end; i++) {
238 float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
241 it = trimmed[i].begin();
242 float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second);
245 mothurOut("Done."); mothurOutEndLine();
248 else { createProcesses(); }
251 //quantiles are used to determine whether the de values found indicate a chimera
252 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each
253 //combination of sequences in the template
254 if (quanfile != "") { quantiles = readQuantiles(); }
257 mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
258 if (processors == 1) {
259 quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
260 }else { createProcessesQuan(); }
263 string o = getRootName(templateFile) + "quan";
265 openOutputFile(o, out4);
268 for (int i = 0; i < quantiles.size(); i++) {
269 if (quantiles[i].size() == 0) {
270 //in case this is not a distance found in your template files
271 for (int g = 0; g < 6; g++) {
272 quantiles[i].push_back(0.0);
276 sort(quantiles[i].begin(), quantiles[i].end());
280 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)]);
282 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)]);
284 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)]);
286 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)]);
288 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)]);
290 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)]);
297 for (int u = 0; u < quantiles[i].size(); u++) { out4 << quantiles[i][u] << '\t'; }
302 mothurOut("Done."); mothurOutEndLine();
306 for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
307 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
309 delete distcalculator;
312 catch(exception& e) {
313 errorOut(e, "Pintail", "getChimeras");
318 //***************************************************************************************************************
320 vector<float> Pintail::readFreq() {
324 openInputFile(consfile, in);
328 //read in probabilities and store in vector
335 //do you want this spot
345 catch(exception& e) {
346 errorOut(e, "Pintail", "readFreq");
351 //***************************************************************************************************************
353 vector< vector<float> > Pintail::readQuantiles() {
357 openInputFile(quanfile, in);
359 vector< vector<float> > quan;
361 int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine;
365 in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
370 temp.push_back(twentyfive);
371 temp.push_back(fifty);
372 temp.push_back(seventyfive);
373 temp.push_back(ninetyfive);
374 temp.push_back(ninetynine);
376 quan.push_back(temp);
385 catch(exception& e) {
386 errorOut(e, "Pintail", "readQuantiles");
390 //***************************************************************************************************************
391 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
392 vector<Sequence> Pintail::findPairs(int start, int end) {
395 vector<Sequence> seqsMatches;
397 for(int i = start; i < end; i++){
399 float smallest = 10000.0;
400 Sequence query = *(querySeqs[i]);
403 for(int j = 0; j < templateSeqs.size(); j++){
405 Sequence temp = *(templateSeqs[j]);
407 distcalculator->calcDist(query, temp);
408 float dist = distcalculator->getDist();
410 if (dist < smallest) {
411 match = *(templateSeqs[j]);
416 seqsMatches.push_back(match);
422 catch(exception& e) {
423 errorOut(e, "Pintail", "findPairs");
428 /**************************************************************************************************/
430 void Pintail::createProcessesSpots() {
432 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
434 vector<int> processIDS;
436 //loop through and create all the processes you want
437 while (process != processors) {
441 processIDS.push_back(pid);
445 mothurOut("Finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
446 bestfit = findPairs(lines[process]->start, lines[process]->end);
447 mothurOut("Done finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
449 int count = lines[process]->start;
450 for (int j = 0; j < bestfit.size(); j++) {
452 //chops off beginning and end of sequences so they both start and end with a base
454 decalc->trimSeqs(querySeqs[count], bestfit[j], trim);
455 trimmed[count] = trim;
460 mothurOut("Finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
461 for (int i = lines[process]->start; i < lines[process]->end; i++) {
462 it = trimmed[i].begin();
463 windowsForeachQuery[i] = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
465 mothurOut("Done finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
467 //write out data to file so parent can read it
469 string s = toString(pid) + ".temp";
470 openOutputFile(s, out);
472 //output range and size
473 out << bestfit.size() << endl;
476 for (int i = 0; i < bestfit.size(); i++) {
477 out << ">" << bestfit[i].getName() << endl << bestfit[i].getAligned() << endl;
480 //output windowsForeachQuery
481 for (int i = 0; i < windowsForeachQuery.size(); i++) {
482 out << windowsForeachQuery[i].size() << '\t';
483 for (int j = 0; j < windowsForeachQuery[i].size(); j++) {
484 out << windowsForeachQuery[i][j] << '\t';
490 for (int i = 0; i < windowSizes.size(); i++) {
491 out << windowSizes[i] << '\t';
497 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
500 //force parent to wait until all the processes are done
501 for (int i=0;i<processors;i++) {
502 int temp = processIDS[i];
506 //get data created by processes
507 for (int i=0;i<processors;i++) {
509 string s = toString(processIDS[i]) + ".temp";
510 openInputFile(s, in);
513 in >> size; gobble(in);
516 int count = lines[i]->start;
517 for (int m = 0; m < size; m++) {
519 bestfit[count] = temp;
527 count = lines[i]->start;
528 for (int m = 0; m < size; m++) {
532 vector<int> win; int w;
533 for (int j = 0; j < num; j++) {
538 windowsForeachQuery[count] = win;
544 count = lines[i]->start;
545 for (int i = 0; i < size; i++) {
549 windowSizes[count] = num;
558 bestfit = findPairs(lines[0]->start, lines[0]->end);
559 for (int j = 0; j < bestfit.size(); j++) {
560 //chops off beginning and end of sequences so they both start and end with a base
561 decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);
564 for (int i = lines[0]->start; i < lines[0]->end; i++) {
565 it = trimmed[i].begin();
566 map<int, int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
572 catch(exception& e) {
573 errorOut(e, "Pintail", "createProcessesSpots");
579 /**************************************************************************************************/
581 void Pintail::createProcesses() {
583 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
585 vector<int> processIDS;
587 vector< vector<float> > exp; exp.resize(querySeqs.size());
588 vector<float> de; de.resize(querySeqs.size());
589 vector< vector<float> > obs; obs.resize(querySeqs.size());
590 vector<float> dev; dev.resize(querySeqs.size());
593 //loop through and create all the processes you want
594 while (process != processors) {
598 processIDS.push_back(pid);
602 mothurOut("Calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
603 for (int i = lines[process]->start; i < lines[process]->end; i++) {
605 vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
609 vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
612 float alpha = decalc->getCoef(obsDistance[i], q);
615 vector<float> exp = decalc->calcExpected(q, alpha);
616 expectedDistance[i] = exp;
618 //get de and deviation
619 float dei = decalc->calcDE(obsi, exp);
622 it = trimmed[i].begin();
623 float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second);
626 mothurOut("Done calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
629 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
632 //force parent to wait until all the processes are done
633 for (int i=0;i<processors;i++) {
634 int temp = processIDS[i];
639 expectedDistance = exp;
644 mothurOut("Calculating observed distance... "); cout.flush();
645 for (int i = lines[0]->start; i < lines[0]->end; i++) {
646 vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windows[i], windowSizes[i]);
647 obsDistance[i] = obsi;
649 mothurOut("Done."); mothurOutEndLine();
653 mothurOut("Finding variability... "); cout.flush();
654 for (int i = lines[0]->start; i < lines[0]->end; i++) {
655 vector<float> q = decalc->findQav(windows[i], windowSizes[i], probabilityProfile, h[i]);
658 mothurOut("Done."); mothurOutEndLine();
662 mothurOut("Calculating alpha... "); cout.flush();
663 for (int i = lines[0]->start; i < lines[0]->end; i++) {
664 float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
665 seqCoef.push_back(alpha);
667 mothurOut("Done."); mothurOutEndLine();
671 mothurOut("Calculating expected distance... "); cout.flush();
672 for (int i = lines[0]->start; i < lines[0]->end; i++) {
673 vector<float> exp = decalc->calcExpected(Qav[i], seqCoef[i]);
674 expectedDistance[i] = exp;
676 mothurOut("Done."); mothurOutEndLine();
680 mothurOut("Finding deviation... "); cout.flush();
681 for (int i = lines[0]->start; i < lines[0]->end; i++) {
682 float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
685 it = trimmed[i].begin();
686 float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second);
689 mothurOut("Done."); mothurOutEndLine();
693 catch(exception& e) {
694 errorOut(e, "Pintail", "createProcesses");
700 /**************************************************************************************************/
702 void Pintail::createProcessesQuan() {
704 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
706 vector<int> processIDS;
707 vector< vector<float> > quan; quan.resize(100);
709 //loop through and create all the processes you want
710 while (process != processors) {
714 processIDS.push_back(pid);
718 vector< vector<float> > q = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
720 for (int i = 0; i < q.size(); i++) {
721 //put all values of q[i] into quan[i]
722 quan[i].insert(quan[i].begin(), q[i].begin(), q[i].end());
725 for (int i = 0; i < quan.size(); i++) {
727 for (int j = 0; j < quan[i].size(); j++) { cout << quan[i][j] << '\t'; }
732 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
735 //force parent to wait until all the processes are done
736 for (int i=0;i<processors;i++) {
737 int temp = processIDS[i];
743 quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
746 catch(exception& e) {
747 errorOut(e, "Pintail", "createProcessesQuan");
753 //***************************************************************************************************************