5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
13 //***************************************************************************************************************
15 Pintail::Pintail(string filename, string temp) { fastafile = filename; templateFile = temp; }
16 //***************************************************************************************************************
20 for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
21 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
22 for (int i = 0; i < bestfit.size(); i++) { delete bestfit[i]; }
25 errorOut(e, "Pintail", "~Pintail");
29 //***************************************************************************************************************
30 void Pintail::print(ostream& out) {
35 for (int i = 0; i < querySeqs.size(); i++) {
37 int index = ceil(deviation[i]);
39 //is your DE value higher than the 95%
41 if (DE[i] > quantiles[index][4]) { chimera = "Yes"; }
42 else { chimera = "No"; }
44 out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl;
45 if (chimera == "Yes") {
46 mothurOut(querySeqs[i]->getName() + "\tdiv: " + toString(deviation[i]) + "\tstDev: " + toString(DE[i]) + "\tchimera flag: " + chimera); mothurOutEndLine();
50 for (int j = 0; j < obsDistance[i].size(); j++) { out << obsDistance[i][j] << '\t'; }
55 for (int m = 0; m < expectedDistance[i].size(); m++) { out << expectedDistance[i][m] << '\t'; }
61 errorOut(e, "Pintail", "print");
66 //***************************************************************************************************************
67 void Pintail::getChimeras() {
70 //read in query sequences and subject sequences
71 mothurOut("Reading sequences and template file... "); cout.flush();
72 querySeqs = readSeqs(fastafile);
73 templateSeqs = readSeqs(templateFile);
74 mothurOut("Done."); mothurOutEndLine();
76 int numSeqs = querySeqs.size();
78 obsDistance.resize(numSeqs);
79 expectedDistance.resize(numSeqs);
80 seqCoef.resize(numSeqs);
83 bestfit.resize(numSeqs);
84 deviation.resize(numSeqs);
85 trimmed.resize(numSeqs);
86 windowSizes.resize(numSeqs, window);
87 windowSizesTemplate.resize(templateSeqs.size(), window);
88 windowsForeachQuery.resize(numSeqs);
90 quantiles.resize(100); //one for every percent mismatch
91 quantilesMembers.resize(100); //one for every percent mismatch
92 makeCompliant.resize(templateSeqs.size(), 0.0);
94 //break up file if needed
95 int linesPerProcess = numSeqs / processors ;
97 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
98 //find breakup of sequences for all times we will Parallelize
99 if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
102 for (int i = 0; i < (processors-1); i++) {
103 lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
105 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
106 int i = processors - 1;
107 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
110 //find breakup of templatefile for quantiles
111 if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
113 for (int i = 0; i < processors; i++) {
114 templateLines.push_back(new linePair());
115 templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
116 templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
120 lines.push_back(new linePair(0, numSeqs));
121 templateLines.push_back(new linePair(0, templateSeqs.size()));
124 distcalculator = new ignoreGaps();
125 decalc = new DeCalculator();
127 //if the user does enter a mask then you want to keep all the spots in the alignment
128 if (seqMask.length() == 0) { decalc->setAlignmentLength(querySeqs[0]->getAligned().length()); }
129 else { decalc->setAlignmentLength(seqMask.length()); }
131 decalc->setMask(seqMask);
134 if (processors == 1) {
135 mothurOut("Finding closest sequence in template to each sequence... "); cout.flush();
136 bestfit = findPairs(lines[0]->start, lines[0]->end);
137 mothurOut("Done."); mothurOutEndLine();
138 }else { createProcessesPairs(); }
142 mothurOut("Getting conservation... "); cout.flush();
143 if (consfile == "") {
144 mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush();
145 probabilityProfile = decalc->calcFreq(templateSeqs, templateFile);
146 mothurOut("Done."); mothurOutEndLine();
147 }else { probabilityProfile = readFreq(); }
150 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl;
151 mothurOut("Done."); mothurOutEndLine();
153 //mask sequences if the user wants to
156 for (int i = 0; i < querySeqs.size(); i++) {
157 decalc->runMask(querySeqs[i]);
161 for (int i = 0; i < templateSeqs.size(); i++) {
162 decalc->runMask(templateSeqs[i]);
165 for (int i = 0; i < bestfit.size(); i++) {
166 decalc->runMask(bestfit[i]);
171 if (processors == 1) {
173 for (int j = 0; j < bestfit.size(); j++) {
174 decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);
177 mothurOut("Finding window breaks... "); cout.flush();
178 for (int i = lines[0]->start; i < lines[0]->end; i++) {
179 it = trimmed[i].begin();
180 vector<int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
181 windowsForeachQuery[i] = win;
183 mothurOut("Done."); mothurOutEndLine();
185 }else { createProcessesSpots(); }
187 if (processors == 1) {
189 mothurOut("Calculating observed distance... "); cout.flush();
190 for (int i = lines[0]->start; i < lines[0]->end; i++) {
191 vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
193 obsDistance[i] = obsi;
195 mothurOut("Done."); mothurOutEndLine();
198 mothurOut("Finding variability... "); cout.flush();
199 for (int i = lines[0]->start; i < lines[0]->end; i++) {
200 vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
204 mothurOut("Done."); mothurOutEndLine();
207 mothurOut("Calculating alpha... "); cout.flush();
208 for (int i = lines[0]->start; i < lines[0]->end; i++) {
209 float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
212 mothurOut("Done."); mothurOutEndLine();
215 mothurOut("Calculating expected distance... "); cout.flush();
216 for (int i = lines[0]->start; i < lines[0]->end; i++) {
217 vector<float> exp = decalc->calcExpected(Qav[i], seqCoef[i]);
218 expectedDistance[i] = exp;
220 mothurOut("Done."); mothurOutEndLine();
223 mothurOut("Finding deviation... "); cout.flush();
224 for (int i = lines[0]->start; i < lines[0]->end; i++) {
225 float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
228 it = trimmed[i].begin();
229 float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second);
232 mothurOut("Done."); mothurOutEndLine();
235 else { createProcesses(); }
237 //quantiles are used to determine whether the de values found indicate a chimera
238 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each
239 //combination of sequences in the template
240 if (quanfile != "") { quantiles = readQuantiles(); }
243 mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
244 if (processors == 1) {
245 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size(), makeCompliant);
246 }else { createProcessesQuan(); }
249 //decided against this because we were having trouble setting the sensitivity... may want to revisit this...
250 //quantiles = decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
255 o = getRootName(templateFile) + "quan";
257 openOutputFile(o, out4);
260 for (int i = 0; i < quantiles.size(); i++) {
263 if (quantiles[i].size() == 0) {
264 //in case this is not a distance found in your template files
265 for (int g = 0; g < 6; g++) {
270 sort(quantiles[i].begin(), quantiles[i].end());
273 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)]);
275 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)]);
277 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)]);
279 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)]);
281 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)]);
283 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)]);
289 for (int u = 0; u < temp.size(); u++) { out4 << temp[u] << '\t'; }
298 mothurOut("Done."); mothurOutEndLine();
303 for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
304 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
306 delete distcalculator;
309 catch(exception& e) {
310 errorOut(e, "Pintail", "getChimeras");
315 //***************************************************************************************************************
317 vector<float> Pintail::readFreq() {
321 openInputFile(consfile, in);
324 set<int> h = decalc->getPos(); //positions of bases in masking sequence
326 //read in probabilities and store in vector
333 if (h.count(pos) > 0) {
335 Pi = (num - 0.25) / 0.75;
337 //cannot have probability less than 0.
338 if (Pi < 0) { Pi = 0.0; }
340 //do you want this spot
351 catch(exception& e) {
352 errorOut(e, "Pintail", "readFreq");
357 //***************************************************************************************************************
359 vector< vector<float> > Pintail::readQuantiles() {
363 openInputFile(quanfile, in);
365 vector< vector<float> > quan;
367 int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine;
371 in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
376 temp.push_back(twentyfive);
377 temp.push_back(fifty);
378 temp.push_back(seventyfive);
379 temp.push_back(ninetyfive);
380 temp.push_back(ninetynine);
382 quan.push_back(temp);
391 catch(exception& e) {
392 errorOut(e, "Pintail", "readQuantiles");
396 //***************************************************************************************************************
397 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
398 vector<Sequence*> Pintail::findPairs(int start, int end) {
401 vector<Sequence*> seqsMatches;
403 for(int i = start; i < end; i++){
405 float smallest = 10000.0;
406 Sequence query = *(querySeqs[i]);
409 for(int j = 0; j < templateSeqs.size(); j++){
411 Sequence temp = *(templateSeqs[j]);
413 distcalculator->calcDist(query, temp);
414 float dist = distcalculator->getDist();
416 if (dist < smallest) {
417 match = templateSeqs[j];
422 //make copy so trimSeqs doesn't overtrim
423 Sequence* copy = new Sequence(match->getName(), match->getAligned());
425 seqsMatches.push_back(copy);
431 catch(exception& e) {
432 errorOut(e, "Pintail", "findPairs");
437 /**************************************************************************************************/
439 void Pintail::createProcessesSpots() {
441 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
443 vector<int> processIDS;
445 //loop through and create all the processes you want
446 while (process != processors) {
450 processIDS.push_back(pid);
454 for (int j = lines[process]->start; j < lines[process]->end; j++) {
456 //chops off beginning and end of sequences so they both start and end with a base
459 decalc->trimSeqs(querySeqs[j], bestfit[j], trim);
464 mothurOut("Finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
465 for (int i = lines[process]->start; i < lines[process]->end; i++) {
466 it = trimmed[i].begin();
467 windowsForeachQuery[i] = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
469 mothurOut("Done finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
471 //write out data to file so parent can read it
473 string s = toString(getpid()) + ".temp";
474 openOutputFile(s, out);
476 //output windowsForeachQuery
477 for (int i = lines[process]->start; i < lines[process]->end; i++) {
478 out << windowsForeachQuery[i].size() << '\t';
479 for (int j = 0; j < windowsForeachQuery[i].size(); j++) {
480 out << windowsForeachQuery[i][j] << '\t';
486 for (int i = lines[process]->start; i < lines[process]->end; i++) {
487 out << windowSizes[i] << '\t';
491 //output trimmed values
492 for (int i = lines[process]->start; i < lines[process]->end; i++) {
493 it = trimmed[i].begin();
494 out << it->first << '\t' << it->second << endl;
499 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
502 //force parent to wait until all the processes are done
503 for (int i=0;i<processors;i++) {
504 int temp = processIDS[i];
508 //get data created by processes
509 for (int i=0;i<processors;i++) {
511 string s = toString(processIDS[i]) + ".temp";
512 openInputFile(s, in);
514 int size = lines[i]->end - lines[i]->start;
516 int count = lines[i]->start;
517 for (int m = 0; m < size; m++) {
521 vector<int> win; int w;
522 for (int j = 0; j < num; j++) {
527 windowsForeachQuery[count] = win;
533 count = lines[i]->start;
534 for (int m = 0; m < size; m++) {
538 windowSizes[count] = num;
544 count = lines[i]->start;
545 for (int m = 0; m < size; m++) {
566 for (int j = 0; j < bestfit.size(); j++) {
567 //chops off beginning and end of sequences so they both start and end with a base
568 decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);
571 for (int i = lines[0]->start; i < lines[0]->end; i++) {
572 it = trimmed[i].begin();
573 vector<int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
574 windowsForeachQuery[i] = win;
579 catch(exception& e) {
580 errorOut(e, "Pintail", "createProcessesSpots");
584 /**************************************************************************************************/
586 void Pintail::createProcessesPairs() {
588 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
590 vector<int> processIDS;
592 //loop through and create all the processes you want
593 while (process != processors) {
597 processIDS.push_back(pid);
601 mothurOut("Finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
602 bestfit = findPairs(lines[process]->start, lines[process]->end);
603 mothurOut("Done finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
605 //write out data to file so parent can read it
607 string s = toString(getpid()) + ".temp";
608 openOutputFile(s, out);
610 //output range and size
611 out << bestfit.size() << endl;
614 for (int i = 0; i < bestfit.size(); i++) {
615 out << ">" << bestfit[i]->getName() << endl << bestfit[i]->getAligned() << endl;
620 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
623 //force parent to wait until all the processes are done
624 for (int i=0;i<processors;i++) {
625 int temp = processIDS[i];
629 //get data created by processes
630 for (int i=0;i<processors;i++) {
632 string s = toString(processIDS[i]) + ".temp";
633 openInputFile(s, in);
636 in >> size; gobble(in);
639 int count = lines[i]->start;
640 for (int m = 0; m < size; m++) {
641 Sequence* temp = new Sequence(in);
642 bestfit[count] = temp;
654 bestfit = findPairs(lines[0]->start, lines[0]->end);
657 catch(exception& e) {
658 errorOut(e, "Pintail", "createProcessesPairs");
662 /**************************************************************************************************/
664 void Pintail::createProcesses() {
666 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
668 vector<int> processIDS;
670 //loop through and create all the processes you want
671 while (process != processors) {
675 processIDS.push_back(pid);
679 mothurOut("Calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
680 for (int i = lines[process]->start; i < lines[process]->end; i++) {
682 vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
683 obsDistance[i] = obsi;
686 vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
689 float alpha = decalc->getCoef(obsDistance[i], q);
692 vector<float> exp = decalc->calcExpected(q, alpha);
693 expectedDistance[i] = exp;
695 //get de and deviation
696 float dei = decalc->calcDE(obsi, exp);
699 it = trimmed[i].begin();
700 float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second);
703 mothurOut("Done calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
705 //write out data to file so parent can read it
707 string s = toString(getpid()) + ".temp";
708 openOutputFile(s, out);
710 int size = lines[process]->end - lines[process]->start;
713 //output observed distances
714 for (int i = lines[process]->start; i < lines[process]->end; i++) {
715 out << obsDistance[i].size() << '\t';
716 for (int j = 0; j < obsDistance[i].size(); j++) {
717 out << obsDistance[i][j] << '\t';
723 //output expected distances
724 for (int i = lines[process]->start; i < lines[process]->end; i++) {
725 out << expectedDistance[i].size() << '\t';
726 for (int j = 0; j < expectedDistance[i].size(); j++) {
727 out << expectedDistance[i][j] << '\t';
734 for (int i = lines[process]->start; i < lines[process]->end; i++) {
735 out << DE[i] << '\t';
740 for (int i = lines[process]->start; i < lines[process]->end; i++) {
741 out << deviation[i] << '\t';
748 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
751 //force parent to wait until all the processes are done
752 for (int i=0;i<processors;i++) {
753 int temp = processIDS[i];
757 //get data created by processes
758 for (int i=0;i<processors;i++) {
760 string s = toString(processIDS[i]) + ".temp";
761 openInputFile(s, in);
764 in >> size; gobble(in);
766 //get observed distances
767 int count = lines[i]->start;
768 for (int m = 0; m < size; m++) {
772 vector<float> obs; float w;
773 for (int j = 0; j < num; j++) {
778 obsDistance[count] = obs;
785 //get expected distances
786 count = lines[i]->start;
787 for (int m = 0; m < size; m++) {
791 vector<float> exp; float w;
792 for (int j = 0; j < num; j++) {
797 expectedDistance[count] = exp;
804 count = lines[i]->start;
805 for (int m = 0; m < size; m++) {
815 count = lines[i]->start;
816 for (int m = 0; m < size; m++) {
820 deviation[count] = num;
830 mothurOut("Calculating observed distance... "); cout.flush();
831 for (int i = lines[0]->start; i < lines[0]->end; i++) {
832 vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
833 obsDistance[i] = obsi;
835 mothurOut("Done."); mothurOutEndLine();
839 mothurOut("Finding variability... "); cout.flush();
840 for (int i = lines[0]->start; i < lines[0]->end; i++) {
841 vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
844 mothurOut("Done."); mothurOutEndLine();
848 mothurOut("Calculating alpha... "); cout.flush();
849 for (int i = lines[0]->start; i < lines[0]->end; i++) {
850 float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
851 seqCoef.push_back(alpha);
853 mothurOut("Done."); mothurOutEndLine();
857 mothurOut("Calculating expected distance... "); cout.flush();
858 for (int i = lines[0]->start; i < lines[0]->end; i++) {
859 vector<float> exp = decalc->calcExpected(Qav[i], seqCoef[i]);
860 expectedDistance[i] = exp;
862 mothurOut("Done."); mothurOutEndLine();
866 mothurOut("Finding deviation... "); cout.flush();
867 for (int i = lines[0]->start; i < lines[0]->end; i++) {
868 float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
871 it = trimmed[i].begin();
872 float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second);
875 mothurOut("Done."); mothurOutEndLine();
879 catch(exception& e) {
880 errorOut(e, "Pintail", "createProcesses");
886 /**************************************************************************************************/
888 void Pintail::createProcessesQuan() {
890 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
892 vector<int> processIDS;
894 //loop through and create all the processes you want
895 while (process != processors) {
899 processIDS.push_back(pid);
903 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end, makeCompliant);
905 //write out data to file so parent can read it
907 string s = toString(getpid()) + ".temp";
908 openOutputFile(s, out);
911 //output observed distances
912 for (int i = 0; i < quantilesMembers.size(); i++) {
913 out << quantilesMembers[i].size() << '\t';
914 for (int j = 0; j < quantilesMembers[i].size(); j++) {
915 out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
923 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
926 //force parent to wait until all the processes are done
927 for (int i=0;i<processors;i++) {
928 int temp = processIDS[i];
932 //get data created by processes
933 for (int i=0;i<processors;i++) {
935 string s = toString(processIDS[i]) + ".temp";
936 openInputFile(s, in);
938 vector< vector<quanMember> > quan;
942 for (int m = 0; m < quan.size(); m++) {
948 vector<quanMember> q; float w; int b, n;
949 for (int j = 0; j < num; j++) {
951 //cout << w << '\t' << b << '\t' n << endl;
952 quanMember newMember(w, b, n);
953 q.push_back(newMember);
955 //cout << "here" << endl;
957 //cout << "now here" << endl;
962 //save quan in quantiles
963 for (int j = 0; j < quan.size(); j++) {
964 //put all values of q[i] into quan[i]
965 for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); }
966 //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
974 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size(), makeCompliant);
977 catch(exception& e) {
978 errorOut(e, "Pintail", "createProcessesQuan");
984 //***************************************************************************************************************