5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
13 //********************************************************************************************************************
14 //sorts lowest to highest
15 inline bool compareQuanMembers(quanMember left, quanMember right){
16 return (left.score < right.score);
18 //***************************************************************************************************************
20 Pintail::Pintail(string filename, string temp) { fastafile = filename; templateFile = temp; }
21 //***************************************************************************************************************
25 for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
26 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
27 for (int i = 0; i < bestfit.size(); i++) { delete bestfit[i]; }
30 errorOut(e, "Pintail", "~Pintail");
34 //***************************************************************************************************************
35 void Pintail::print(ostream& out) {
40 for (int i = 0; i < querySeqs.size(); i++) {
42 int index = ceil(deviation[i]);
44 //is your DE value higher than the 95%
46 if (DE[i] > quantiles[index][4]) { chimera = "Yes"; }
47 else { chimera = "No"; }
49 out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl;
50 if (chimera == "Yes") {
51 mothurOut(querySeqs[i]->getName() + "\tdiv: " + toString(deviation[i]) + "\tstDev: " + toString(DE[i]) + "\tchimera flag: " + chimera); mothurOutEndLine();
55 for (int j = 0; j < obsDistance[i].size(); j++) { out << obsDistance[i][j] << '\t'; }
60 for (int m = 0; m < expectedDistance[i].size(); m++) { out << expectedDistance[i][m] << '\t'; }
66 errorOut(e, "Pintail", "print");
71 //***************************************************************************************************************
72 void Pintail::getChimeras() {
75 //read in query sequences and subject sequences
76 mothurOut("Reading sequences and template file... "); cout.flush();
77 querySeqs = readSeqs(fastafile);
78 templateSeqs = readSeqs(templateFile);
79 mothurOut("Done."); mothurOutEndLine();
81 int numSeqs = querySeqs.size();
83 obsDistance.resize(numSeqs);
84 expectedDistance.resize(numSeqs);
85 seqCoef.resize(numSeqs);
88 bestfit.resize(numSeqs);
89 deviation.resize(numSeqs);
90 trimmed.resize(numSeqs);
91 windowSizes.resize(numSeqs, window);
92 windowSizesTemplate.resize(templateSeqs.size(), window);
93 windowsForeachQuery.resize(numSeqs);
95 quantiles.resize(100); //one for every percent mismatch
96 quantilesMembers.resize(100); //one for every percent mismatch
98 //break up file if needed
99 int linesPerProcess = numSeqs / processors ;
101 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
102 //find breakup of sequences for all times we will Parallelize
103 if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
106 for (int i = 0; i < (processors-1); i++) {
107 lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
109 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
110 int i = processors - 1;
111 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
114 //find breakup of templatefile for quantiles
115 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
117 for (int i = 0; i < processors; i++) {
118 templateLines.push_back(new linePair());
119 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
120 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
124 lines.push_back(new linePair(0, numSeqs));
125 templateLines.push_back(new linePair(0, templateSeqs.size()));
128 distcalculator = new ignoreGaps();
129 decalc = new DeCalculator();
131 //if the user does enter a mask then you want to keep all the spots in the alignment
132 if (seqMask.length() == 0) { decalc->setAlignmentLength(querySeqs[0]->getAligned().length()); }
133 else { decalc->setAlignmentLength(seqMask.length()); }
135 decalc->setMask(seqMask);
138 if (processors == 1) {
139 mothurOut("Finding closest sequence in template to each sequence... "); cout.flush();
140 bestfit = findPairs(lines[0]->start, lines[0]->end);
141 mothurOut("Done."); mothurOutEndLine();
142 }else { createProcessesPairs(); }
146 mothurOut("Getting conservation... "); cout.flush();
147 if (consfile == "") {
148 mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush();
149 probabilityProfile = decalc->calcFreq(templateSeqs, templateFile);
150 mothurOut("Done."); mothurOutEndLine();
151 }else { probabilityProfile = readFreq(); }
154 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl;
155 mothurOut("Done."); mothurOutEndLine();
157 //mask sequences if the user wants to
160 for (int i = 0; i < querySeqs.size(); i++) {
161 decalc->runMask(querySeqs[i]);
165 for (int i = 0; i < templateSeqs.size(); i++) {
166 decalc->runMask(templateSeqs[i]);
169 for (int i = 0; i < bestfit.size(); i++) {
170 decalc->runMask(bestfit[i]);
175 if (processors == 1) {
177 for (int j = 0; j < bestfit.size(); j++) {
178 decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);
181 mothurOut("Finding window breaks... "); cout.flush();
182 for (int i = lines[0]->start; i < lines[0]->end; i++) {
183 it = trimmed[i].begin();
184 vector<int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
185 windowsForeachQuery[i] = win;
187 mothurOut("Done."); mothurOutEndLine();
189 }else { createProcessesSpots(); }
191 if (processors == 1) {
193 mothurOut("Calculating observed distance... "); cout.flush();
194 for (int i = lines[0]->start; i < lines[0]->end; i++) {
195 vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
197 obsDistance[i] = obsi;
199 mothurOut("Done."); mothurOutEndLine();
202 mothurOut("Finding variability... "); cout.flush();
203 for (int i = lines[0]->start; i < lines[0]->end; i++) {
204 vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
208 mothurOut("Done."); mothurOutEndLine();
211 mothurOut("Calculating alpha... "); cout.flush();
212 for (int i = lines[0]->start; i < lines[0]->end; i++) {
213 float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
216 mothurOut("Done."); mothurOutEndLine();
219 mothurOut("Calculating expected distance... "); cout.flush();
220 for (int i = lines[0]->start; i < lines[0]->end; i++) {
221 vector<float> exp = decalc->calcExpected(Qav[i], seqCoef[i]);
222 expectedDistance[i] = exp;
224 mothurOut("Done."); mothurOutEndLine();
227 mothurOut("Finding deviation... "); cout.flush();
228 for (int i = lines[0]->start; i < lines[0]->end; i++) {
229 float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
232 it = trimmed[i].begin();
233 float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second);
236 mothurOut("Done."); mothurOutEndLine();
239 else { createProcesses(); }
241 //quantiles are used to determine whether the de values found indicate a chimera
242 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each
243 //combination of sequences in the template
244 if (quanfile != "") { quantiles = readQuantiles(); }
247 mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
248 if (processors == 1) {
249 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
250 }else { createProcessesQuan(); }
255 //decided against this because we were having trouble setting the sensitivity... may want to revisit this...
256 //quantiles = decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
261 o = getRootName(templateFile) + "quan";
263 openOutputFile(o, out4);
266 for (int i = 0; i < quantilesMembers.size(); i++) {
269 if (quantilesMembers[i].size() == 0) {
270 //in case this is not a distance found in your template files
271 for (int g = 0; g < 6; g++) {
276 sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
279 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
281 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
283 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
285 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
287 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
289 temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
295 for (int u = 0; u < temp.size(); u++) { out4 << temp[u] << '\t'; }
304 mothurOut("Done."); mothurOutEndLine();
309 for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
310 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
312 delete distcalculator;
315 catch(exception& e) {
316 errorOut(e, "Pintail", "getChimeras");
321 //***************************************************************************************************************
323 vector<float> Pintail::readFreq() {
327 openInputFile(consfile, in);
330 set<int> h = decalc->getPos(); //positions of bases in masking sequence
332 //read in probabilities and store in vector
339 if (h.count(pos) > 0) {
341 Pi = (num - 0.25) / 0.75;
343 //cannot have probability less than 0.
344 if (Pi < 0) { Pi = 0.0; }
346 //do you want this spot
357 catch(exception& e) {
358 errorOut(e, "Pintail", "readFreq");
363 //***************************************************************************************************************
365 vector< vector<float> > Pintail::readQuantiles() {
369 openInputFile(quanfile, in);
371 vector< vector<float> > quan;
373 int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine;
377 in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
382 temp.push_back(twentyfive);
383 temp.push_back(fifty);
384 temp.push_back(seventyfive);
385 temp.push_back(ninetyfive);
386 temp.push_back(ninetynine);
388 quan.push_back(temp);
397 catch(exception& e) {
398 errorOut(e, "Pintail", "readQuantiles");
402 //***************************************************************************************************************
403 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
404 vector<Sequence*> Pintail::findPairs(int start, int end) {
407 vector<Sequence*> seqsMatches;
409 for(int i = start; i < end; i++){
411 float smallest = 10000.0;
412 Sequence query = *(querySeqs[i]);
415 for(int j = 0; j < templateSeqs.size(); j++){
417 Sequence temp = *(templateSeqs[j]);
419 distcalculator->calcDist(query, temp);
420 float dist = distcalculator->getDist();
422 if (dist < smallest) {
423 match = templateSeqs[j];
428 //make copy so trimSeqs doesn't overtrim
429 Sequence* copy = new Sequence(match->getName(), match->getAligned());
431 seqsMatches.push_back(copy);
437 catch(exception& e) {
438 errorOut(e, "Pintail", "findPairs");
443 /**************************************************************************************************/
445 void Pintail::createProcessesSpots() {
447 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
449 vector<int> processIDS;
451 //loop through and create all the processes you want
452 while (process != processors) {
456 processIDS.push_back(pid);
460 for (int j = lines[process]->start; j < lines[process]->end; j++) {
462 //chops off beginning and end of sequences so they both start and end with a base
465 decalc->trimSeqs(querySeqs[j], bestfit[j], trim);
470 mothurOut("Finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
471 for (int i = lines[process]->start; i < lines[process]->end; i++) {
472 it = trimmed[i].begin();
473 windowsForeachQuery[i] = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
475 mothurOut("Done finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
477 //write out data to file so parent can read it
479 string s = toString(getpid()) + ".temp";
480 openOutputFile(s, out);
482 //output windowsForeachQuery
483 for (int i = lines[process]->start; i < lines[process]->end; i++) {
484 out << windowsForeachQuery[i].size() << '\t';
485 for (int j = 0; j < windowsForeachQuery[i].size(); j++) {
486 out << windowsForeachQuery[i][j] << '\t';
492 for (int i = lines[process]->start; i < lines[process]->end; i++) {
493 out << windowSizes[i] << '\t';
497 //output trimmed values
498 for (int i = lines[process]->start; i < lines[process]->end; i++) {
499 it = trimmed[i].begin();
500 out << it->first << '\t' << it->second << endl;
505 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
508 //force parent to wait until all the processes are done
509 for (int i=0;i<processors;i++) {
510 int temp = processIDS[i];
514 //get data created by processes
515 for (int i=0;i<processors;i++) {
517 string s = toString(processIDS[i]) + ".temp";
518 openInputFile(s, in);
520 int size = lines[i]->end - lines[i]->start;
522 int count = lines[i]->start;
523 for (int m = 0; m < size; m++) {
527 vector<int> win; int w;
528 for (int j = 0; j < num; j++) {
533 windowsForeachQuery[count] = win;
539 count = lines[i]->start;
540 for (int m = 0; m < size; m++) {
544 windowSizes[count] = num;
550 count = lines[i]->start;
551 for (int m = 0; m < size; m++) {
572 for (int j = 0; j < bestfit.size(); j++) {
573 //chops off beginning and end of sequences so they both start and end with a base
574 decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);
577 for (int i = lines[0]->start; i < lines[0]->end; i++) {
578 it = trimmed[i].begin();
579 vector<int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
580 windowsForeachQuery[i] = win;
585 catch(exception& e) {
586 errorOut(e, "Pintail", "createProcessesSpots");
590 /**************************************************************************************************/
592 void Pintail::createProcessesPairs() {
594 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
596 vector<int> processIDS;
598 //loop through and create all the processes you want
599 while (process != processors) {
603 processIDS.push_back(pid);
607 mothurOut("Finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
608 bestfit = findPairs(lines[process]->start, lines[process]->end);
609 mothurOut("Done finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
611 //write out data to file so parent can read it
613 string s = toString(getpid()) + ".temp";
614 openOutputFile(s, out);
616 //output range and size
617 out << bestfit.size() << endl;
620 for (int i = 0; i < bestfit.size(); i++) {
621 out << ">" << bestfit[i]->getName() << endl << bestfit[i]->getAligned() << endl;
626 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
629 //force parent to wait until all the processes are done
630 for (int i=0;i<processors;i++) {
631 int temp = processIDS[i];
635 //get data created by processes
636 for (int i=0;i<processors;i++) {
638 string s = toString(processIDS[i]) + ".temp";
639 openInputFile(s, in);
642 in >> size; gobble(in);
645 int count = lines[i]->start;
646 for (int m = 0; m < size; m++) {
647 Sequence* temp = new Sequence(in);
648 bestfit[count] = temp;
660 bestfit = findPairs(lines[0]->start, lines[0]->end);
663 catch(exception& e) {
664 errorOut(e, "Pintail", "createProcessesPairs");
668 /**************************************************************************************************/
670 void Pintail::createProcesses() {
672 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
674 vector<int> processIDS;
676 //loop through and create all the processes you want
677 while (process != processors) {
681 processIDS.push_back(pid);
685 mothurOut("Calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
686 for (int i = lines[process]->start; i < lines[process]->end; i++) {
688 vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
689 obsDistance[i] = obsi;
692 vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
695 float alpha = decalc->getCoef(obsDistance[i], q);
698 vector<float> exp = decalc->calcExpected(q, alpha);
699 expectedDistance[i] = exp;
701 //get de and deviation
702 float dei = decalc->calcDE(obsi, exp);
705 it = trimmed[i].begin();
706 float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second);
709 mothurOut("Done calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
711 //write out data to file so parent can read it
713 string s = toString(getpid()) + ".temp";
714 openOutputFile(s, out);
716 int size = lines[process]->end - lines[process]->start;
719 //output observed distances
720 for (int i = lines[process]->start; i < lines[process]->end; i++) {
721 out << obsDistance[i].size() << '\t';
722 for (int j = 0; j < obsDistance[i].size(); j++) {
723 out << obsDistance[i][j] << '\t';
729 //output expected distances
730 for (int i = lines[process]->start; i < lines[process]->end; i++) {
731 out << expectedDistance[i].size() << '\t';
732 for (int j = 0; j < expectedDistance[i].size(); j++) {
733 out << expectedDistance[i][j] << '\t';
740 for (int i = lines[process]->start; i < lines[process]->end; i++) {
741 out << DE[i] << '\t';
746 for (int i = lines[process]->start; i < lines[process]->end; i++) {
747 out << deviation[i] << '\t';
754 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
757 //force parent to wait until all the processes are done
758 for (int i=0;i<processors;i++) {
759 int temp = processIDS[i];
763 //get data created by processes
764 for (int i=0;i<processors;i++) {
766 string s = toString(processIDS[i]) + ".temp";
767 openInputFile(s, in);
770 in >> size; gobble(in);
772 //get observed distances
773 int count = lines[i]->start;
774 for (int m = 0; m < size; m++) {
778 vector<float> obs; float w;
779 for (int j = 0; j < num; j++) {
784 obsDistance[count] = obs;
791 //get expected distances
792 count = lines[i]->start;
793 for (int m = 0; m < size; m++) {
797 vector<float> exp; float w;
798 for (int j = 0; j < num; j++) {
803 expectedDistance[count] = exp;
810 count = lines[i]->start;
811 for (int m = 0; m < size; m++) {
821 count = lines[i]->start;
822 for (int m = 0; m < size; m++) {
826 deviation[count] = num;
836 mothurOut("Calculating observed distance... "); cout.flush();
837 for (int i = lines[0]->start; i < lines[0]->end; i++) {
838 vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
839 obsDistance[i] = obsi;
841 mothurOut("Done."); mothurOutEndLine();
845 mothurOut("Finding variability... "); cout.flush();
846 for (int i = lines[0]->start; i < lines[0]->end; i++) {
847 vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
850 mothurOut("Done."); mothurOutEndLine();
854 mothurOut("Calculating alpha... "); cout.flush();
855 for (int i = lines[0]->start; i < lines[0]->end; i++) {
856 float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
857 seqCoef.push_back(alpha);
859 mothurOut("Done."); mothurOutEndLine();
863 mothurOut("Calculating expected distance... "); cout.flush();
864 for (int i = lines[0]->start; i < lines[0]->end; i++) {
865 vector<float> exp = decalc->calcExpected(Qav[i], seqCoef[i]);
866 expectedDistance[i] = exp;
868 mothurOut("Done."); mothurOutEndLine();
872 mothurOut("Finding deviation... "); cout.flush();
873 for (int i = lines[0]->start; i < lines[0]->end; i++) {
874 float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
877 it = trimmed[i].begin();
878 float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second);
881 mothurOut("Done."); mothurOutEndLine();
885 catch(exception& e) {
886 errorOut(e, "Pintail", "createProcesses");
892 /**************************************************************************************************/
894 void Pintail::createProcessesQuan() {
896 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
898 vector<int> processIDS;
900 //loop through and create all the processes you want
901 while (process != processors) {
905 processIDS.push_back(pid);
909 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
911 //write out data to file so parent can read it
913 string s = toString(getpid()) + ".temp";
914 openOutputFile(s, out);
917 //output observed distances
918 for (int i = 0; i < quantilesMembers.size(); i++) {
919 out << quantilesMembers[i].size() << '\t';
920 for (int j = 0; j < quantilesMembers[i].size(); j++) {
921 out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
929 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
932 //force parent to wait until all the processes are done
933 for (int i=0;i<processors;i++) {
934 int temp = processIDS[i];
938 //get data created by processes
939 for (int i=0;i<processors;i++) {
941 string s = toString(processIDS[i]) + ".temp";
942 openInputFile(s, in);
944 vector< vector<quanMember> > quan;
948 for (int m = 0; m < quan.size(); m++) {
954 vector<quanMember> q; float w; int b, n;
955 for (int j = 0; j < num; j++) {
957 //cout << w << '\t' << b << '\t' n << endl;
958 quanMember newMember(w, b, n);
959 q.push_back(newMember);
961 //cout << "here" << endl;
963 //cout << "now here" << endl;
968 //save quan in quantiles
969 for (int j = 0; j < quan.size(); j++) {
970 //put all values of q[i] into quan[i]
971 for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); }
972 //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
980 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
983 catch(exception& e) {
984 errorOut(e, "Pintail", "createProcessesQuan");
990 //***************************************************************************************************************