5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
13 //***************************************************************************************************************
15 Pintail::Pintail(string name) {
20 errorOut(e, "Pintail", "Pintail");
24 //***************************************************************************************************************
28 for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
29 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
32 errorOut(e, "Pintail", "~Pintail");
36 //***************************************************************************************************************
37 void Pintail::print(ostream& out) {
40 for (itCoef = DE.begin(); itCoef != DE.end(); itCoef++) {
42 out << itCoef->first->getName() << '\t' << itCoef->second << endl;
45 itObsDist = obsDistance.find(itCoef->first);
46 for (int i = 0; i < itObsDist->second.size(); i++) { out << itObsDist->second[i] << '\t'; }
51 itExpDist = expectedDistance.find(itCoef->first);
52 for (int i = 0; i < itExpDist->second.size(); i++) { out << itExpDist->second[i] << '\t'; }
59 errorOut(e, "Pintail", "print");
64 //***************************************************************************************************************
65 void Pintail::getChimeras() {
68 distCalculator = new ignoreGaps();
70 //read in query sequences and subject sequences
71 mothurOut("Reading sequences and template file... "); cout.flush();
72 querySeqs = readSeqs(fastafile);
73 templateSeqs = readSeqs(templateFile);
74 mothurOut("Done."); mothurOutEndLine();
76 int numSeqs = querySeqs.size();
78 //if window is set to default
79 if (window == 0) { if (querySeqs[0]->getAligned().length() > 800) { setWindow(200); }
80 else{ setWindow((querySeqs[0]->getAligned().length() / 4)); } }
82 //calculate number of iters
83 iters = (querySeqs[0]->getAligned().length() - window + 1) / increment;
85 int linesPerProcess = processors / numSeqs;
87 //find breakup of sequences for all times we will Parallelize
88 if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
91 for (int i = 0; i < (processors-1); i++) {
92 lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
94 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
95 int i = processors - 1;
96 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
99 //map query sequences to their most similiar sequences in the template - Parallelized
100 mothurOut("Finding closest sequence in template to each sequence... "); cout.flush();
101 if (processors == 1) { findPairs(lines[0]->start, lines[0]->end); }
102 else { createProcessesPairs(); }
103 mothurOut("Done."); mothurOutEndLine();
104 // itBest = bestfit.begin();
105 // itBest++; itBest++;
106 //cout << itBest->first->getName() << '\t' << itBest->second->getName() << endl;
107 //find Oqs for each sequence - the observed distance in each window - Parallelized
108 mothurOut("Calculating observed percentage differences for each sequence... "); cout.flush();
109 if (processors == 1) { calcObserved(lines[0]->start, lines[0]->end); }
110 else { createProcessesObserved(); }
111 mothurOut("Done."); mothurOutEndLine();
114 mothurOut("Calculating expected percentage differences for each sequence... "); cout.flush();
115 vector<float> probabilityProfile = calcFreq(templateSeqs);
118 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; }
119 //cout << "after p into Q" << endl;
121 averageProbability = findQav(probabilityProfile);
122 //cout << "after Qav" << endl;
123 //find Coefficient - maps a sequence to its coefficient
124 seqCoef = getCoef(averageProbability);
125 //cout << "after coef" << endl;
126 //find Eqs for each sequence - the expected distance in each window - Parallelized
127 if (processors == 1) { calcExpected(lines[0]->start, lines[0]->end); }
128 else { createProcessesExpected(); }
129 mothurOut("Done."); mothurOutEndLine();
131 //find deviation - Parallelized
132 mothurOut("Finding deviation from expected... "); cout.flush();
133 if (processors == 1) { calcDE(lines[0]->start, lines[0]->end); }
134 else { createProcessesDE(); }
135 mothurOut("Done."); mothurOutEndLine();
139 for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
140 delete distCalculator;
143 catch(exception& e) {
144 errorOut(e, "Pintail", "getChimeras");
149 //***************************************************************************************************************
151 vector<Sequence*> Pintail::readSeqs(string file) {
155 openInputFile(file, in);
156 vector<Sequence*> container;
158 //read in seqs and store in vector
160 Sequence* current = new Sequence(in);
162 if (current->getAligned() == "") { current->setAligned(current->getUnaligned()); }
164 container.push_back(current);
173 catch(exception& e) {
174 errorOut(e, "Pintail", "readSeqs");
179 //***************************************************************************************************************
180 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
181 void Pintail::findPairs(int start, int end) {
184 for(int i = start; i < end; i++){
186 float smallest = 10000.0;
187 Sequence query = *(querySeqs[i]);
189 for(int j = 0; j < templateSeqs.size(); j++){
191 Sequence temp = *(templateSeqs[j]);
193 distCalculator->calcDist(query, temp);
194 float dist = distCalculator->getDist();
196 if (dist < smallest) {
198 bestfit[querySeqs[i]] = templateSeqs[j];
205 catch(exception& e) {
206 errorOut(e, "Pintail", "findPairs");
210 //***************************************************************************************************************
211 void Pintail::calcObserved(int start, int end) {
214 for(int i = start; i < end; i++){
216 itBest = bestfit.find(querySeqs[i]);
220 if (itBest != bestfit.end()) {
221 query = itBest->first;
222 subject = itBest->second;
223 }else{ mothurOut("Error in calcObserved"); mothurOutEndLine(); }
225 vector<Sequence*> queryFragment;
226 vector<Sequence*> subjectFragment;
229 for (int m = 0; m < iters; m++) {
230 string seqFrag = query->getAligned().substr(startpoint, window);
231 Sequence* temp = new Sequence(query->getName(), seqFrag);
232 queryFragment.push_back(temp);
234 string seqFragsub = subject->getAligned().substr(startpoint, window);
235 Sequence* tempsub = new Sequence(subject->getName(), seqFragsub);
236 subjectFragment.push_back(tempsub);
238 startpoint += increment;
242 for(int j = 0; j < iters; j++){
244 distCalculator->calcDist(*(queryFragment[j]), *(subjectFragment[j]));
245 float dist = distCalculator->getDist();
247 //convert to similiarity
251 obsDistance[query].push_back(dist);
255 for (int i = 0; i < queryFragment.size(); i++) { delete queryFragment[i]; }
256 for (int i = 0; i < subjectFragment.size(); i++) { delete subjectFragment[i]; }
260 catch(exception& e) {
261 errorOut(e, "Pintail", "calcObserved");
266 //***************************************************************************************************************
267 void Pintail::calcExpected(int start, int end) {
272 for(int i = start; i < end; i++){
274 itCoef = seqCoef.find(querySeqs[i]);
275 float coef = itCoef->second;
278 vector<float> queryExpected;
279 for (int m = 0; m < iters; m++) {
280 float expected = averageProbability[m] * coef;
281 queryExpected.push_back(expected);
284 expectedDistance[querySeqs[i]] = queryExpected;
289 catch(exception& e) {
290 errorOut(e, "Pintail", "calcExpected");
294 //***************************************************************************************************************
295 void Pintail::calcDE(int start, int end) {
300 for(int i = start; i < end; i++){
302 itObsDist = obsDistance.find(querySeqs[i]);
303 vector<float> obs = itObsDist->second;
305 itExpDist = expectedDistance.find(querySeqs[i]);
306 vector<float> exp = itExpDist->second;
309 float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2
310 for (int m = 0; m < iters; m++) { sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); }
312 float de = sqrt((sum / (iters - 1)));
314 DE[querySeqs[i]] = de;
318 catch(exception& e) {
319 errorOut(e, "Pintail", "calcDE");
324 //***************************************************************************************************************
326 vector<float> Pintail::calcFreq(vector<Sequence*> seqs) {
331 //at each position in the sequence
332 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
334 vector<int> freq; freq.resize(4,0);
336 //find the frequency of each nucleotide
337 for (int j = 0; j < seqs.size(); j++) {
339 char value = seqs[j]->getAligned()[i];
341 if(toupper(value) == 'A') { freq[0]++; }
342 else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; }
343 else if(toupper(value) == 'G') { freq[2]++; }
344 else if(toupper(value) == 'C') { freq[3]++; }
347 //find base with highest frequency
349 for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } }
351 float highFreq = highest / (float) seqs.size();
353 //if this is not a gap column
354 if (highFreq != 0) { Pi = (highFreq - 0.25) / 0.75; }
364 catch(exception& e) {
365 errorOut(e, "Pintail", "calcFreq");
369 //***************************************************************************************************************
370 vector<float> Pintail::findQav(vector<float> prob) {
372 vector<float> averages;
374 //for each window find average
376 for (int m = 0; m < iters; m++) {
379 for (int i = startpoint; i < (startpoint+window); i++) { average += prob[i]; }
381 average = average / window;
383 //save this windows average
384 averages.push_back(average);
386 startpoint += increment;
391 catch(exception& e) {
392 errorOut(e, "Pintail", "findQav");
396 //***************************************************************************************************************
397 map<Sequence*, float> Pintail::getCoef(vector<float> prob) {
399 map<Sequence*, float> coefs;
402 float probAverage = 0.0;
403 for (int i = 0; i < prob.size(); i++) { probAverage += prob[i]; }
404 probAverage = probAverage / (float) prob.size();
406 //find a coef for each sequence
407 map<Sequence*, vector<float> >::iterator it;
408 for (it = obsDistance.begin(); it != obsDistance.end(); it++) {
410 vector<float> temp = it->second;
411 Sequence* tempSeq = it->first;
413 //find observed average
414 float obsAverage = 0.0;
415 for (int i = 0; i < temp.size(); i++) { obsAverage += temp[i]; }
416 obsAverage = obsAverage / (float) temp.size();
418 float coef = obsAverage / probAverage;
420 //save this sequences coefficient
421 coefs[tempSeq] = coef;
427 catch(exception& e) {
428 errorOut(e, "Pintail", "getCoef");
433 /**************************************************************************************************/
435 void Pintail::createProcessesPairs() {
437 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
439 vector<int> processIDS;
441 //loop through and create all the processes you want
442 while (process != processors) {
446 processIDS.push_back(pid);
449 findPairs(lines[process]->start, lines[process]->end);
451 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
454 //force parent to wait until all the processes are done
455 for (int i=0;i<processors;i++) {
456 int temp = processIDS[i];
461 findPairs(lines[0]->start, lines[0]->end);
465 catch(exception& e) {
466 errorOut(e, "Pintail", "createProcessesPairs");
471 /**************************************************************************************************/
473 void Pintail::createProcessesObserved() {
475 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
477 vector<int> processIDS;
479 //loop through and create all the processes you want
480 while (process != processors) {
484 processIDS.push_back(pid);
487 calcObserved(lines[process]->start, lines[process]->end);
489 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
492 //force parent to wait until all the processes are done
493 for (int i=0;i<processors;i++) {
494 int temp = processIDS[i];
499 calcObserved(lines[0]->start, lines[0]->end);
503 catch(exception& e) {
504 errorOut(e, "Pintail", "createProcessesObserved");
509 //***************************************************************************************************************
511 void Pintail::createProcessesExpected() {
513 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
515 vector<int> processIDS;
517 //loop through and create all the processes you want
518 while (process != processors) {
522 processIDS.push_back(pid);
525 calcExpected(lines[process]->start, lines[process]->end);
527 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
530 //force parent to wait until all the processes are done
531 for (int i=0;i<processors;i++) {
532 int temp = processIDS[i];
537 calcExpected(lines[0]->start, lines[0]->end);
541 catch(exception& e) {
542 errorOut(e, "Pintail", "createProcessesExpected");
547 /**************************************************************************************************/
549 void Pintail::createProcessesDE() {
551 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
553 vector<int> processIDS;
555 //loop through and create all the processes you want
556 while (process != processors) {
560 processIDS.push_back(pid);
563 calcDE(lines[process]->start, lines[process]->end);
565 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
568 //force parent to wait until all the processes are done
569 for (int i=0;i<processors;i++) {
570 int temp = processIDS[i];
575 calcDE(lines[0]->start, lines[0]->end);
579 catch(exception& e) {
580 errorOut(e, "Pintail", "createProcessesDE");
585 //***************************************************************************************************************