5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "eachgapdist.h"
13 //***************************************************************************************************************
15 Pintail::Pintail(string filename, string temp) { fastafile = filename; templateFile = temp; }
16 //***************************************************************************************************************
20 for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
21 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
24 errorOut(e, "Pintail", "~Pintail");
28 //***************************************************************************************************************
29 void Pintail::print(ostream& out) {
32 for (int i = 0; i < querySeqs.size(); i++) {
34 out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << endl;
37 for (int j = 0; j < obsDistance[i].size(); j++) { out << obsDistance[i][j] << '\t'; }
42 for (int m = 0; m < expectedDistance[i].size(); m++) { out << expectedDistance[i][m] << '\t'; }
48 errorOut(e, "Pintail", "print");
53 //***************************************************************************************************************
54 void Pintail::getChimeras() {
57 //read in query sequences and subject sequences
58 mothurOut("Reading sequences and template file... "); cout.flush();
59 querySeqs = readSeqs(fastafile);
60 templateSeqs = readSeqs(templateFile);
61 mothurOut("Done."); mothurOutEndLine();
63 int numSeqs = querySeqs.size();
65 obsDistance.resize(numSeqs);
66 expectedDistance.resize(numSeqs);
67 seqCoef.resize(numSeqs);
70 bestfit.resize(numSeqs);
72 deviation.resize(numSeqs);
73 windowSizes.resize(numSeqs, window);
75 //break up file if needed
76 int linesPerProcess = processors / numSeqs;
78 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
79 //find breakup of sequences for all times we will Parallelize
80 if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
83 for (int i = 0; i < (processors-1); i++) {
84 lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
86 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
87 int i = processors - 1;
88 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
91 lines.push_back(new linePair(0, numSeqs));
94 distcalculator = new eachGapDist();
96 if (processors == 1) {
97 mothurOut("Finding closest sequence in template to each sequence... "); cout.flush();
98 bestfit = findPairs(lines[0]->start, lines[0]->end);
99 for (int m = 0; m < templateSeqs.size(); m++) {
100 if (templateSeqs[m]->getName() == "198806") { bestfit[0] = *(templateSeqs[m]); }
101 if (templateSeqs[m]->getName() == "198806") { bestfit[1] = *(templateSeqs[m]); }
102 if (templateSeqs[m]->getName() == "108139") { bestfit[2] = *(templateSeqs[m]); }
105 for (int j = 0; j < bestfit.size(); j++) {//cout << querySeqs[j]->getName() << '\t' << "length = " << querySeqs[j]->getAligned().length() << '\t' << bestfit[j].getName() << " length = " << bestfit[j].getAligned().length() << endl;
106 //chops off beginning and end of sequences so they both start and end with a base
107 trimSeqs(querySeqs[j], bestfit[j], j);
108 //cout << "NEW SEQ PAIR" << querySeqs[j]->getAligned() << endl << "IN THE MIDDLE" << endl << bestfit[j].getAligned() << endl;
112 mothurOut("Done."); mothurOutEndLine();
114 windows = findWindows(lines[0]->start, lines[0]->end);
115 } else { createProcessesSpots(); }
118 if (consfile == "") { probabilityProfile = calcFreq(templateSeqs); }
119 else { probabilityProfile = readFreq(); }
122 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; }
124 if (processors == 1) {
126 mothurOut("Calculating observed distance... "); cout.flush();
127 obsDistance = calcObserved(lines[0]->start, lines[0]->end);
128 mothurOut("Done."); mothurOutEndLine();
130 mothurOut("Finding variability... "); cout.flush();
131 Qav = findQav(lines[0]->start, lines[0]->end);
132 for (int i = 0; i < Qav.size(); i++) {
133 cout << querySeqs[i]->getName() << " = ";
134 for (int u = 0; u < Qav[i].size();u++) { cout << Qav[i][u] << '\t'; }
135 cout << endl << endl;
139 mothurOut("Done."); mothurOutEndLine();
141 mothurOut("Calculating alpha... "); cout.flush();
142 seqCoef = getCoef(lines[0]->start, lines[0]->end);
143 for (int i = 0; i < seqCoef.size(); i++) {
144 cout << querySeqs[i]->getName() << " coef = " << seqCoef[i] << endl;
147 mothurOut("Done."); mothurOutEndLine();
149 mothurOut("Calculating expected distance... "); cout.flush();
150 expectedDistance = calcExpected(lines[0]->start, lines[0]->end);
151 mothurOut("Done."); mothurOutEndLine();
153 mothurOut("Finding deviation... "); cout.flush();
154 DE = calcDE(lines[0]->start, lines[0]->end);
155 deviation = calcDist(lines[0]->start, lines[0]->end);
156 mothurOut("Done."); mothurOutEndLine();
161 else { createProcesses(); }
163 delete distcalculator;
166 for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
170 catch(exception& e) {
171 errorOut(e, "Pintail", "getChimeras");
176 //***************************************************************************************************************
178 vector<Sequence*> Pintail::readSeqs(string file) {
182 openInputFile(file, in);
183 vector<Sequence*> container;
185 //read in seqs and store in vector
187 Sequence* current = new Sequence(in);
189 if (current->getAligned() == "") { current->setAligned(current->getUnaligned()); }
190 //takes out stuff is needed
191 current->setUnaligned(current->getUnaligned());
193 container.push_back(current);
202 catch(exception& e) {
203 errorOut(e, "Pintail", "readSeqs");
208 //***************************************************************************************************************
209 //num is query's spot in querySeqs
210 void Pintail::trimSeqs(Sequence* query, Sequence& subject, int num) {
213 string q = query->getAligned();
214 string s = subject.getAligned();
217 for (int i = 0; i < q.length(); i++) {
218 if (isalpha(q[i]) && isalpha(s[i])) { front = i; break; }
221 q = q.substr(front, q.length());
222 s = s.substr(front, s.length());
225 for (int i = q.length(); i >= 0; i--) {
226 if (isalpha(q[i]) && isalpha(s[i])) { back = i; break; }
229 q = q.substr(0, back);
230 s = s.substr(0, back);
232 trim[num][front] = back;
235 query->setAligned(q);
236 query->setUnaligned(q);
237 subject.setAligned(s);
238 subject.setUnaligned(s);
240 catch(exception& e) {
241 errorOut(e, "Pintail", "trimSeqs");
246 //***************************************************************************************************************
248 vector<float> Pintail::readFreq() {
252 openInputFile(consfile, in);
256 //read in probabilities and store in vector
272 catch(exception& e) {
273 errorOut(e, "Pintail", "readFreq");
278 //***************************************************************************************************************
279 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
280 vector<Sequence> Pintail::findPairs(int start, int end) {
283 vector<Sequence> seqsMatches; seqsMatches.resize(end-start);
285 for(int i = start; i < end; i++){
287 float smallest = 10000.0;
288 Sequence query = *(querySeqs[i]);
290 for(int j = 0; j < templateSeqs.size(); j++){
292 Sequence temp = *(templateSeqs[j]);
294 distcalculator->calcDist(query, temp);
295 float dist = distcalculator->getDist();
297 if (dist < smallest) {
298 seqsMatches[i] = *(templateSeqs[j]);
307 catch(exception& e) {
308 errorOut(e, "Pintail", "findPairs");
313 //***************************************************************************************************************
314 //find the window breaks for each sequence
315 vector< vector<int> > Pintail::findWindows(int start, int end) {
318 vector< vector<int> > win; win.resize(end-start);
322 for(int i = start; i < end; i++){
324 //if window is set to default
325 if (windowSizes[i] == 0) { if (querySeqs[i]->getAligned().length() > 1200) { windowSizes[i] = 300; }
326 else{ windowSizes[i] = (querySeqs[i]->getAligned().length() / 4); } }
327 else if (windowSizes[i] > (querySeqs[i]->getAligned().length() / 4)) {
328 mothurOut("You have selected to large a window size for sequence " + querySeqs[i]->getName() + ". I will choose an appropriate window size."); mothurOutEndLine();
329 windowSizes[i] = (querySeqs[i]->getAligned().length() / 4);
332 //cout << "length = " << querySeqs[i]->getAligned().length() << " window = " << windowSizes[i] << " increment = " << increment << endl;
335 string seq = querySeqs[i]->getAligned();
336 int numBases = querySeqs[i]->getUnaligned().length();
339 //find location of first base
340 for (int j = 0; j < seq.length(); j++) {
341 if (isalpha(seq[j])) { spot = j; break; }
345 win[count].push_back(spot);
348 //move ahead increment bases at a time until all bases are in a window
350 int totalBases = 0; //used to eliminate window of blanks at end of sequence
351 for (int m = spot; m < seq.length(); m++) {
353 //count number of bases you see
354 if (isalpha(seq[m])) { countBases++; totalBases++; }
356 //if you have seen enough bases to make a new window
357 if (countBases >= increment) {
358 win[count].push_back(m); //save spot in alignment
359 countBases = 0; //reset bases you've seen in this window
362 //no need to continue if all your bases are in a window
363 if (totalBases == numBases) { break; }
374 catch(exception& e) {
375 errorOut(e, "Pintail", "findWindows");
380 //***************************************************************************************************************
381 vector< vector<float> > Pintail::calcObserved(int start, int end) {
384 vector< vector<float> > temp;
385 temp.resize(end-start);
388 for(int i = start; i < end; i++){
390 Sequence* query = querySeqs[i];
391 Sequence subject = bestfit[i];
394 for (int m = 0; m < windows[i].size(); m++) {
396 string seqFrag = query->getAligned().substr(windows[i][startpoint], windowSizes[i]);
397 string seqFragsub = subject.getAligned().substr(windows[i][startpoint], windowSizes[i]);
400 for (int b = 0; b < seqFrag.length(); b++) {
402 //if either the query or subject is not a gap
403 if ((isalpha(seqFrag[b])) || (isalpha(seqFragsub[b]))) {
404 //and they are different - penalize
405 if (seqFrag[b] != seqFragsub[b]) { diff++; }
409 //percentage of mismatched bases
411 dist = diff / (float) seqFrag.length() * 100;
413 temp[count].push_back(dist);
423 catch(exception& e) {
424 errorOut(e, "Pintail", "calcObserved");
428 //***************************************************************************************************************
429 vector<float> Pintail::calcDist(int start, int end) {
434 for(int i = start; i < end; i++){
436 Sequence* query = querySeqs[i];
437 Sequence subject = bestfit[i];
439 string seqFrag = query->getAligned();
440 string seqFragsub = subject.getAligned();
443 for (int b = 0; b < seqFrag.length(); b++) {
445 //if either the query or subject is not a gap
446 if ((isalpha(seqFrag[b])) || (isalpha(seqFragsub[b]))) {
447 //and they are different - penalize
448 if (seqFrag[b] != seqFragsub[b]) { diff++; }
452 //percentage of mismatched bases
454 dist = diff / (float) seqFrag.length() * 100;
456 temp.push_back(dist);
461 catch(exception& e) {
462 errorOut(e, "Pintail", "calcDist");
467 //***************************************************************************************************************
468 vector< vector<float> > Pintail::calcExpected(int start, int end) {
471 vector< vector<float> > temp; temp.resize(end-start);
475 for(int i = start; i < end; i++){
477 float coef = seqCoef[i];
480 vector<float> queryExpected;
481 for (int m = 0; m < windows[i].size(); m++) {
482 float expected = Qav[i][m] * coef;
483 queryExpected.push_back(expected);
484 //cout << "average variabilty over window = " << averageProbability[m] << " coef = " << coef << " ei = " << expected << '\t' << "window = " << m << endl;
487 temp[count] = queryExpected;
496 catch(exception& e) {
497 errorOut(e, "Pintail", "calcExpected");
501 //***************************************************************************************************************
502 vector<float> Pintail::calcDE(int start, int end) {
505 vector<float> temp; temp.resize(end-start);
509 for(int i = start; i < end; i++){
511 vector<float> obs = obsDistance[i];
512 vector<float> exp = expectedDistance[i];
514 // cout << "difference between obs and exp = " << abs(obs[m] - exp[m]) << endl;
516 float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2
517 for (int m = 0; m < windows[i].size(); m++) { sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); }
519 float de = sqrt((sum / (windows[i].size() - 1)));
527 catch(exception& e) {
528 errorOut(e, "Pintail", "calcDE");
533 //***************************************************************************************************************
535 vector<float> Pintail::calcFreq(vector<Sequence*> seqs) {
539 string freqfile = getRootName(templateFile) + "probability";
542 openOutputFile(freqfile, outFreq);
544 //at each position in the sequence
545 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
547 vector<int> freq; freq.resize(4,0);
550 //find the frequency of each nucleotide
551 for (int j = 0; j < seqs.size(); j++) {
553 char value = seqs[j]->getAligned()[i];
555 if(toupper(value) == 'A') { freq[0]++; }
556 else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; }
557 else if(toupper(value) == 'G') { freq[2]++; }
558 else if(toupper(value) == 'C') { freq[3]++; }
562 //find base with highest frequency
564 for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } }
567 //if ( (seqs.size() - gaps) == 0 ) { highFreq = 1.0; }
568 //else { highFreq = highest / (float) (seqs.size() - gaps); }
569 highFreq = highest / (float) seqs.size();
570 cout << i << '\t' << highFreq << endl;
573 Pi = (highFreq - 0.25) / 0.75;
575 //saves this for later
576 outFreq << i << '\t' << Pi << endl;
586 catch(exception& e) {
587 errorOut(e, "Pintail", "calcFreq");
591 //***************************************************************************************************************
592 vector< vector<float> > Pintail::findQav(int start, int end) {
594 vector< vector<float> > averages;
595 map<int, int>::iterator it;
597 for(int i = start; i < end; i++){
599 //for each window find average
601 for (int m = 0; m < windows[i].size(); m++) {
605 it = trim[i].begin(); //trim[i] is a map of where this sequence was trimmed
607 //while you are in the window for this sequence
608 for (int j = windows[i][m]+it->first; j < (windows[i][m]+windowSizes[i]); j++) { average += probabilityProfile[j]; }
610 average = average / windowSizes[i];
611 //cout << average << endl;
612 //save this windows average
613 temp.push_back(average);
617 averages.push_back(temp);
622 catch(exception& e) {
623 errorOut(e, "Pintail", "findQav");
627 //***************************************************************************************************************
628 vector<float> Pintail::getCoef(int start, int end) {
631 coefs.resize(end-start);
633 //find a coef for each sequence
635 for(int i = start; i < end; i++){
637 //find average prob for this seqs windows
638 float probAverage = 0.0;
639 for (int j = 0; j < Qav[i].size(); j++) { probAverage += Qav[i][j]; }
640 probAverage = probAverage / (float) Qav[i].size();
641 cout << "(sum of ai) / m = " << probAverage << endl;
643 vector<float> temp = obsDistance[i];
645 //find observed average
646 float obsAverage = 0.0;
647 for (int j = 0; j < temp.size(); j++) { obsAverage += temp[j]; }
648 obsAverage = obsAverage / (float) temp.size();
649 cout << "(sum of oi) / m = " << obsAverage << endl;
650 float coef = obsAverage / probAverage;
652 //save this sequences coefficient
661 catch(exception& e) {
662 errorOut(e, "Pintail", "getCoef");
668 /**************************************************************************************************/
670 void Pintail::createProcessesSpots() {
672 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
674 vector<int> processIDS;
675 vector< vector<int> > win; win.resize(querySeqs.size());
676 vector< map <int, int> > t; t.resize(querySeqs.size());
678 //loop through and create all the processes you want
679 while (process != processors) {
683 processIDS.push_back(pid);
687 vector<Sequence> tempbest;
688 tempbest = findPairs(lines[process]->start, lines[process]->end);
690 for (int i = lines[process]->start; i < lines[process]->end; i++) {
691 bestfit[i] = tempbest[count];
693 //chops off beginning and end of sequences so they both start and end with a base
694 trimSeqs(querySeqs[i], bestfit[i], i);
702 vector< vector<int> > temp = findWindows(lines[process]->start, lines[process]->end);
706 for (int i = lines[process]->start; i < lines[process]->end; i++) {
707 win[i] = temp[count];
712 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
715 //force parent to wait until all the processes are done
716 for (int i=0;i<processors;i++) {
717 int temp = processIDS[i];
724 windows = findWindows(lines[0]->start, lines[0]->end);
728 catch(exception& e) {
729 errorOut(e, "Pintail", "createProcessesSpots");
735 /**************************************************************************************************/
737 void Pintail::createProcesses() {
739 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
741 vector<int> processIDS;
743 vector< vector<float> > exp; exp.resize(querySeqs.size());
744 vector<float> de; de.resize(querySeqs.size());
745 vector< vector<float> > obs; obs.resize(querySeqs.size());
748 //loop through and create all the processes you want
749 while (process != processors) {
753 processIDS.push_back(pid);
757 vector< vector<float> > temp;
758 vector<float> tempde;
762 temp = calcObserved(lines[process]->start, lines[process]->end);
764 for (int i = lines[process]->start; i < lines[process]->end; i++) {
765 obs[i] = temp[count];
769 temp = findQav(lines[process]->start, lines[process]->end);
771 for (int i = lines[process]->start; i < lines[process]->end; i++) {
772 Qav[i] = temp[count];
776 tempde = getCoef(lines[process]->start, lines[process]->end);
778 for (int i = lines[process]->start; i < lines[process]->end; i++) {
779 seqCoef[i] = tempde[count];
783 temp = calcExpected(lines[process]->start, lines[process]->end);
785 for (int i = lines[process]->start; i < lines[process]->end; i++) {
786 exp[i] = temp[count];
791 tempde = calcDE(lines[process]->start, lines[process]->end);
793 for (int i = lines[process]->start; i < lines[process]->end; i++) {
794 de[i] = tempde[count];
799 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
802 //force parent to wait until all the processes are done
803 for (int i=0;i<processors;i++) {
804 int temp = processIDS[i];
809 expectedDistance = exp;
813 bestfit = findPairs(lines[0]->start, lines[0]->end);
814 obsDistance = calcObserved(lines[0]->start, lines[0]->end);
815 Qav = findQav(lines[0]->start, lines[0]->end);
816 seqCoef = getCoef(lines[0]->start, lines[0]->end);
817 expectedDistance = calcExpected(lines[0]->start, lines[0]->end);
818 DE = calcDE(lines[0]->start, lines[0]->end);
822 catch(exception& e) {
823 errorOut(e, "Pintail", "createProcesses");
828 //***************************************************************************************************************