5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
13 //***************************************************************************************************************
15 Pintail::Pintail(string filename, string temp) { fastafile = filename; templateFile = temp; }
16 //***************************************************************************************************************
20 for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
21 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
24 errorOut(e, "Pintail", "~Pintail");
28 //***************************************************************************************************************
29 void Pintail::print(ostream& out) {
32 for (int i = 0; i < querySeqs.size(); i++) {
34 out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << endl;
37 for (int j = 0; j < obsDistance[i].size(); j++) { out << obsDistance[i][j] << '\t'; }
42 for (int m = 0; m < expectedDistance[i].size(); m++) { out << expectedDistance[i][m] << '\t'; }
48 errorOut(e, "Pintail", "print");
53 //***************************************************************************************************************
54 void Pintail::getChimeras() {
57 //read in query sequences and subject sequences
58 mothurOut("Reading sequences and template file... "); cout.flush();
59 querySeqs = readSeqs(fastafile);
60 templateSeqs = readSeqs(templateFile);
61 mothurOut("Done."); mothurOutEndLine();
63 int numSeqs = querySeqs.size();
65 obsDistance.resize(numSeqs);
66 expectedDistance.resize(numSeqs);
67 seqCoef.resize(numSeqs);
70 bestfit.resize(numSeqs);
71 deviation.resize(numSeqs);
72 trimmed.resize(numSeqs);
73 windowSizes.resize(numSeqs, window);
74 windows.resize(numSeqs);
76 //break up file if needed
77 int linesPerProcess = processors / numSeqs;
79 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
80 //find breakup of sequences for all times we will Parallelize
81 if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
84 for (int i = 0; i < (processors-1); i++) {
85 lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
87 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
88 int i = processors - 1;
89 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
92 lines.push_back(new linePair(0, numSeqs));
95 distcalculator = new ignoreGaps();
98 if (processors == 1) {
99 mothurOut("Finding closest sequence in template to each sequence... "); cout.flush();
100 bestfit = findPairs(lines[0]->start, lines[0]->end);
101 /*for (int m = 0; m < templateSeqs.size(); m++) {
102 if (templateSeqs[m]->getName() == "159481") { bestfit[17] = *(templateSeqs[m]); }
103 if (templateSeqs[m]->getName() == "100137") { bestfit[16] = *(templateSeqs[m]); }
104 if (templateSeqs[m]->getName() == "112956") { bestfit[15] = *(templateSeqs[m]); }
105 if (templateSeqs[m]->getName() == "102326") { bestfit[14] = *(templateSeqs[m]); }
106 if (templateSeqs[m]->getName() == "66229") { bestfit[13] = *(templateSeqs[m]); }
107 if (templateSeqs[m]->getName() == "206276") { bestfit[12] = *(templateSeqs[m]); }
108 if (templateSeqs[m]->getName() == "63607") { bestfit[11] = *(templateSeqs[m]); }
109 if (templateSeqs[m]->getName() == "7056") { bestfit[10] = *(templateSeqs[m]); }
110 if (templateSeqs[m]->getName() == "7088") { bestfit[9] = *(templateSeqs[m]); }
111 if (templateSeqs[m]->getName() == "17553") { bestfit[8] = *(templateSeqs[m]); }
112 if (templateSeqs[m]->getName() == "131723") { bestfit[7] = *(templateSeqs[m]); }
113 if (templateSeqs[m]->getName() == "69013") { bestfit[6] = *(templateSeqs[m]); }
114 if (templateSeqs[m]->getName() == "24543") { bestfit[5] = *(templateSeqs[m]); }
115 if (templateSeqs[m]->getName() == "27824") { bestfit[4] = *(templateSeqs[m]); }
116 if (templateSeqs[m]->getName() == "1456") { bestfit[3] = *(templateSeqs[m]); }
117 if (templateSeqs[m]->getName() == "1456") { bestfit[2] = *(templateSeqs[m]); }
118 if (templateSeqs[m]->getName() == "141312") { bestfit[1] = *(templateSeqs[m]); }
119 if (templateSeqs[m]->getName() == "141312") { bestfit[0] = *(templateSeqs[m]); }
124 for (int j = 0; j < bestfit.size(); j++) {
125 //chops off beginning and end of sequences so they both start and end with a base
126 trimSeqs(querySeqs[j], bestfit[j], j);
128 mothurOut("Done."); mothurOutEndLine();
130 for (int i = lines[0]->start; i < lines[0]->end; i++) {
131 it = trimmed[i].begin();
132 vector<int> win = findWindows(querySeqs[i], it->first, it->second, windowSizes[i]);
137 } else { createProcessesSpots(); }
140 if (consfile == "") { probabilityProfile = calcFreq(templateSeqs); }
141 else { probabilityProfile = readFreq(); }
144 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; }
146 if (processors == 1) {
148 mothurOut("Calculating observed distance... "); cout.flush();
149 for (int i = lines[0]->start; i < lines[0]->end; i++) {
150 vector<float> obsi = calcObserved(querySeqs[i], bestfit[i], windows[i], windowSizes[i]);
151 obsDistance[i] = obsi;
153 mothurOut("Done."); mothurOutEndLine();
157 mothurOut("Finding variability... "); cout.flush();
158 for (int i = lines[0]->start; i < lines[0]->end; i++) {
159 vector<float> q = findQav(windows[i], windowSizes[i]);
162 mothurOut("Done."); mothurOutEndLine();
166 mothurOut("Calculating alpha... "); cout.flush();
167 for (int i = lines[0]->start; i < lines[0]->end; i++) {
168 float alpha = getCoef(obsDistance[i], Qav[i]);
169 seqCoef.push_back(alpha);
171 mothurOut("Done."); mothurOutEndLine();
175 mothurOut("Calculating expected distance... "); cout.flush();
176 for (int i = lines[0]->start; i < lines[0]->end; i++) {
177 vector<float> exp = calcExpected(Qav[i], seqCoef[i]);
178 expectedDistance[i] = exp;
180 mothurOut("Done."); mothurOutEndLine();
184 mothurOut("Finding deviation... "); cout.flush();
185 for (int i = lines[0]->start; i < lines[0]->end; i++) {
186 float de = calcDE(obsDistance[i], expectedDistance[i]);
189 it = trimmed[i].begin();
190 float dist = calcDist(querySeqs[i], bestfit[i], it->first, it->second);
193 mothurOut("Done."); mothurOutEndLine();
196 else { createProcesses(); }
198 delete distcalculator;
200 //quantiles are used to determine whether the de values found indicate a chimera
201 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each
202 //combination of sequences in the template
203 if (quanfile != "") { readQuantiles(); }
205 if (processors == 1) {
206 quantiles = getQuantiles(lines[0]->start, lines[0]->end);
207 }else { createProcessesQuan(); }
212 for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
216 catch(exception& e) {
217 errorOut(e, "Pintail", "getChimeras");
222 //***************************************************************************************************************
224 vector<Sequence*> Pintail::readSeqs(string file) {
228 openInputFile(file, in);
229 vector<Sequence*> container;
231 //read in seqs and store in vector
233 Sequence* current = new Sequence(in);
235 //if (current->getAligned() == "") { current->setAligned(current->getUnaligned()); }
236 //takes out stuff is needed
237 //current->setUnaligned(current->getUnaligned());
239 container.push_back(current);
248 catch(exception& e) {
249 errorOut(e, "Pintail", "readSeqs");
254 //***************************************************************************************************************
255 //num is query's spot in querySeqs
256 map<int, int> Pintail::trimSeqs(Sequence* query, Sequence subject, int num) {
259 string q = query->getAligned();
260 string s = subject.getAligned();
263 for (int i = 0; i < q.length(); i++) {
264 if (isalpha(q[i]) && isalpha(s[i])) { front = i; break; }
268 for (int i = q.length(); i >= 0; i--) {
269 if (isalpha(q[i]) && isalpha(s[i])) { back = i; break; }
272 //if num = -1 then you are calling this from quantiles
274 trimmed[num][front] = back;
283 catch(exception& e) {
284 errorOut(e, "Pintail", "trimSeqs");
289 //***************************************************************************************************************
291 vector<float> Pintail::readFreq() {
295 openInputFile(consfile, in);
299 //read in probabilities and store in vector
306 //do you want this spot
316 catch(exception& e) {
317 errorOut(e, "Pintail", "readFreq");
322 //***************************************************************************************************************
324 vector< vector<float> > Pintail::readQuantiles() {
328 openInputFile(quanfile, in);
330 vector< vector<float> > quan;
332 //read in probabilities and store in vector
333 int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine;
337 in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
342 temp.push_back(twentyfive);
343 temp.push_back(fifty);
344 temp.push_back(seventyfive);
345 temp.push_back(ninetyfive);
346 temp.push_back(ninetynine);
348 //do you want this spot
349 quan.push_back(temp);
358 catch(exception& e) {
359 errorOut(e, "Pintail", "readQuantiles");
365 //***************************************************************************************************************
366 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
367 vector<Sequence> Pintail::findPairs(int start, int end) {
370 vector<Sequence> seqsMatches; seqsMatches.resize(end-start);
372 for(int i = start; i < end; i++){
374 float smallest = 10000.0;
375 Sequence query = *(querySeqs[i]);
377 for(int j = 0; j < templateSeqs.size(); j++){
379 Sequence temp = *(templateSeqs[j]);
381 distcalculator->calcDist(query, temp);
382 float dist = distcalculator->getDist();
384 if (dist < smallest) {
385 seqsMatches[i] = *(templateSeqs[j]);
394 catch(exception& e) {
395 errorOut(e, "Pintail", "findPairs");
400 //***************************************************************************************************************
401 //find the window breaks for each sequence - this is so you can move ahead by bases.
402 vector<int> Pintail::findWindows(Sequence* query, int front, int back, int& size) {
407 int cutoff = back - front; //back - front
409 //if window is set to default
410 if (size == 0) { if (cutoff > 1200) { size = 300; }
411 else{ size = (cutoff / 4); } }
412 else if (size > (cutoff / 4)) {
413 mothurOut("You have selected to large a window size for sequence " + query->getName() + ". I will choose an appropriate window size."); mothurOutEndLine();
417 string seq = query->getAligned().substr(front, cutoff);
421 for (int l = 0; l < seq.length(); l++) { if (isalpha(seq[l])) { numBases++; } }
424 win.push_back(front);
426 //move ahead increment bases at a time until all bases are in a window
428 int totalBases = 0; //used to eliminate window of blanks at end of sequence
430 seq = query->getAligned();
431 for (int m = front; m < (back - size) ; m++) {
433 //count number of bases you see
434 if (isalpha(seq[m])) { countBases++; totalBases++; }
436 //if you have seen enough bases to make a new window
437 if (countBases >= increment) {
438 win.push_back(m); //save spot in alignment
439 countBases = 0; //reset bases you've seen in this window
442 //no need to continue if all your bases are in a window
443 if (totalBases == numBases) { break; }
449 catch(exception& e) {
450 errorOut(e, "Pintail", "findWindows");
455 //***************************************************************************************************************
456 vector<float> Pintail::calcObserved(Sequence* query, Sequence subject, vector<int> window, int size) {
462 for (int m = 0; m < windows.size(); m++) {
464 string seqFrag = query->getAligned().substr(window[startpoint], size);
465 string seqFragsub = subject.getAligned().substr(window[startpoint], size);
468 for (int b = 0; b < seqFrag.length(); b++) {
469 if (seqFrag[b] != seqFragsub[b]) { diff++; }
472 //percentage of mismatched bases
474 dist = diff / (float) seqFrag.length() * 100;
476 temp.push_back(dist);
483 catch(exception& e) {
484 errorOut(e, "Pintail", "calcObserved");
488 //***************************************************************************************************************
489 float Pintail::calcDist(Sequence* query, Sequence subject, int front, int back) {
492 //so you only look at the trimmed part of the sequence
493 int cutoff = back - front;
495 //from first startpoint with length back-front
496 string seqFrag = query->getAligned().substr(front, cutoff);
497 string seqFragsub = subject.getAligned().substr(front, cutoff);
500 for (int b = 0; b < seqFrag.length(); b++) {
501 if (seqFrag[b] != seqFragsub[b]) { diff++; }
504 //percentage of mismatched bases
505 float dist = diff / (float) seqFrag.length() * 100;
509 catch(exception& e) {
510 errorOut(e, "Pintail", "calcDist");
515 //***************************************************************************************************************
516 vector<float> Pintail::calcExpected(vector<float> qav, float coef) {
520 vector<float> queryExpected;
522 for (int m = 0; m < qav.size(); m++) {
524 float expected = qav[m] * coef;
526 queryExpected.push_back(expected);
529 return queryExpected;
532 catch(exception& e) {
533 errorOut(e, "Pintail", "calcExpected");
537 //***************************************************************************************************************
538 float Pintail::calcDE(vector<float> obs, vector<float> exp) {
542 float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2
543 for (int m = 0; m < obsDistance.size(); m++) { sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); }
545 float de = sqrt((sum / (obsDistance.size() - 1)));
549 catch(exception& e) {
550 errorOut(e, "Pintail", "calcDE");
555 //***************************************************************************************************************
557 vector<float> Pintail::calcFreq(vector<Sequence*> seqs) {
561 string freqfile = getRootName(templateFile) + "probability";
564 openOutputFile(freqfile, outFreq);
566 //at each position in the sequence
567 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
569 vector<int> freq; freq.resize(4,0);
572 //find the frequency of each nucleotide
573 for (int j = 0; j < seqs.size(); j++) {
575 char value = seqs[j]->getAligned()[i];
577 if(toupper(value) == 'A') { freq[0]++; }
578 else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; }
579 else if(toupper(value) == 'G') { freq[2]++; }
580 else if(toupper(value) == 'C') { freq[3]++; }
584 //find base with highest frequency
586 for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } }
589 if ( (seqs.size() - gaps) == 0 ) { highFreq = 1.0; }
590 else { highFreq = highest / (float) (seqs.size() - gaps); }
591 //highFreq = highest / (float) seqs.size();
592 //cout << i << '\t' << highFreq << endl;
595 Pi = (highFreq - 0.25) / 0.75;
597 //cannot have probability less than 0.
598 if (Pi < 0) { Pi = 0.0; }
600 //saves this for later
601 outFreq << i+1 << '\t' << Pi << endl;
611 catch(exception& e) {
612 errorOut(e, "Pintail", "calcFreq");
616 //***************************************************************************************************************
617 vector<float> Pintail::findQav(vector<int> window, int size) {
619 vector<float> averages;
621 //for each window find average
622 for (int m = 0; m < windows.size(); m++) {
626 //while you are in the window for this sequence
627 for (int j = window[m]; j < (window[m]+size); j++) { average += probabilityProfile[j]; }
629 average = average / size;
631 //save this windows average
632 averages.push_back(average);
637 catch(exception& e) {
638 errorOut(e, "Pintail", "findQav");
643 //***************************************************************************************************************
644 vector< vector<float> > Pintail::getQuantiles(int start, int end) {
646 vector< vector<float> > quan;
649 for(int i = start; i < end; i++){
651 Sequence* query = templateSeqs[i];
653 //compare to every other sequence in template
654 for(int j = 0; j < i; j++){
656 Sequence subject = *(templateSeqs[j]);
658 map<int, int> trim = trimSeqs(query, subject, -1);
672 catch(exception& e) {
673 errorOut(e, "Pintail", "findQav");
678 //***************************************************************************************************************
679 float Pintail::getCoef(vector<float> obs, vector<float> qav) {
682 //find average prob for this seqs windows
683 float probAverage = 0.0;
684 for (int j = 0; j < qav.size(); j++) { probAverage += qav[j]; }
685 probAverage = probAverage / (float) Qav.size();
687 //find observed average
688 float obsAverage = 0.0;
689 for (int j = 0; j < obs.size(); j++) { obsAverage += obs[j]; }
690 obsAverage = obsAverage / (float) obs.size();
693 float coef = obsAverage / probAverage;
697 catch(exception& e) {
698 errorOut(e, "Pintail", "getCoef");
702 /**************************************************************************************************/
704 void Pintail::createProcessesSpots() {
706 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
708 vector<int> processIDS;
709 vector< vector<int> > win; win.resize(querySeqs.size());
711 //loop through and create all the processes you want
712 while (process != processors) {
716 processIDS.push_back(pid);
720 vector<Sequence> tempbest;
721 tempbest = findPairs(lines[process]->start, lines[process]->end);
723 for (int i = lines[process]->start; i < lines[process]->end; i++) {
724 bestfit[i] = tempbest[count];
726 //chops off beginning and end of sequences so they both start and end with a base
727 trimSeqs(querySeqs[i], bestfit[i], i);
732 for (int i = lines[process]->start; i < lines[process]->end; i++) {
733 vector<int> temp = findWindows(querySeqs[i], it->first, it->second, windowSizes[i]);
738 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
741 //force parent to wait until all the processes are done
742 for (int i=0;i<processors;i++) {
743 int temp = processIDS[i];
749 bestfit = findPairs(lines[0]->start, lines[0]->end);
750 for (int j = 0; j < bestfit.size(); j++) {
751 //chops off beginning and end of sequences so they both start and end with a base
752 trimSeqs(querySeqs[j], bestfit[j], j);
755 for (int i = lines[0]->start; i < lines[0]->end; i++) {
756 it = trimmed[i].begin();
757 map<int, int> win = findWindows(querySeqs[i], it->first, it->second, windowSizes[i]);
763 catch(exception& e) {
764 errorOut(e, "Pintail", "createProcessesSpots");
770 /**************************************************************************************************/
772 void Pintail::createProcesses() {
774 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
776 vector<int> processIDS;
778 vector< vector<float> > exp; exp.resize(querySeqs.size());
779 vector<float> de; de.resize(querySeqs.size());
780 vector< vector<float> > obs; obs.resize(querySeqs.size());
781 vector<float> dev; dev.resize(querySeqs.size());
784 //loop through and create all the processes you want
785 while (process != processors) {
789 processIDS.push_back(pid);
794 for (int i = lines[process]->start; i < lines[process]->end; i++) {
795 vector<float> obsi = calcObserved(querySeqs[i], bestfit[i], windows[i], windowSizes[i]);
799 vector<float> q = findQav(windows[i], windowSizes[i]);
802 float alpha = getCoef(obsDistance[i], q);
805 vector<float> exp = calcExpected(q, alpha);
806 expectedDistance[i] = exp;
808 //get de and deviation
809 float dei = calcDE(obsi, exp);
812 it = trimmed[i].begin();
813 float dist = calcDist(querySeqs[i], bestfit[i], it->first, it->second);
818 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
821 //force parent to wait until all the processes are done
822 for (int i=0;i<processors;i++) {
823 int temp = processIDS[i];
828 expectedDistance = exp;
833 mothurOut("Calculating observed distance... "); cout.flush();
834 for (int i = lines[0]->start; i < lines[0]->end; i++) {
835 vector<float> obsi = calcObserved(querySeqs[i], bestfit[i], windows[i], windowSizes[i]);
836 obsDistance[i] = obsi;
838 mothurOut("Done."); mothurOutEndLine();
842 mothurOut("Finding variability... "); cout.flush();
843 for (int i = lines[0]->start; i < lines[0]->end; i++) {
844 vector<float> q = findQav(windows[i], windowSizes[i]);
847 mothurOut("Done."); mothurOutEndLine();
851 mothurOut("Calculating alpha... "); cout.flush();
852 for (int i = lines[0]->start; i < lines[0]->end; i++) {
853 float alpha = getCoef(obsDistance[i], Qav[i]);
854 seqCoef.push_back(alpha);
856 mothurOut("Done."); mothurOutEndLine();
860 mothurOut("Calculating expected distance... "); cout.flush();
861 for (int i = lines[0]->start; i < lines[0]->end; i++) {
862 vector<float> exp = calcExpected(Qav[i], seqCoef[i]);
863 expectedDistance[i] = exp;
865 mothurOut("Done."); mothurOutEndLine();
869 mothurOut("Finding deviation... "); cout.flush();
870 for (int i = lines[0]->start; i < lines[0]->end; i++) {
871 float de = calcDE(obsDistance[i], expectedDistance[i]);
874 it = trimmed[i].begin();
875 float dist = calcDist(querySeqs[i], bestfit[i], it->first, it->second);
878 mothurOut("Done."); mothurOutEndLine();
882 catch(exception& e) {
883 errorOut(e, "Pintail", "createProcesses");
889 /**************************************************************************************************/
891 void Pintail::createProcessesQuan() {
893 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
895 vector<int> processIDS;
897 //loop through and create all the processes you want
898 while (process != processors) {
902 processIDS.push_back(pid);
908 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
911 //force parent to wait until all the processes are done
912 for (int i=0;i<processors;i++) {
913 int temp = processIDS[i];
921 catch(exception& e) {
922 errorOut(e, "Pintail", "createProcessesQuan");
928 //***************************************************************************************************************