5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
11 #include "ignoregaps.h"
13 //***************************************************************************************************************
15 Pintail::Pintail(string name) {
20 errorOut(e, "Pintail", "Pintail");
24 //***************************************************************************************************************
28 for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
29 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
32 errorOut(e, "Pintail", "~Pintail");
36 //***************************************************************************************************************
37 void Pintail::print(ostream& out) {
40 for (itCoef = DE.begin(); itCoef != DE.end(); itCoef++) {
42 out << itCoef->first->getName() << '\t' << itCoef->second << endl;
45 itObsDist = obsDistance.find(itCoef->first);
46 for (int i = 0; i < itObsDist->second.size(); i++) { out << itObsDist->second[i] << '\t'; }
51 itExpDist = expectedDistance.find(itCoef->first);
52 for (int i = 0; i < itExpDist->second.size(); i++) { out << itExpDist->second[i] << '\t'; }
60 errorOut(e, "Pintail", "print");
65 //***************************************************************************************************************
66 void Pintail::getChimeras() {
69 distCalculator = new ignoreGaps();
71 //read in query sequences and subject sequences
72 mothurOut("Reading sequences and template file... "); cout.flush();
73 querySeqs = readSeqs(fastafile);
74 templateSeqs = readSeqs(templateFile);
75 mothurOut("Done."); mothurOutEndLine();
77 int numSeqs = querySeqs.size();
79 //if window is set to default
80 if (window == 0) { if (querySeqs[0]->getAligned().length() > 800) { setWindow(200); }
81 else{ setWindow((querySeqs[0]->getAligned().length() / 4)); } }
82 else if (window > (querySeqs[0]->getAligned().length() / 4)) {
83 mothurOut("You have selected to large a window size for you sequences. I will choose a smaller window."); mothurOutEndLine();
84 setWindow((querySeqs[0]->getAligned().length() / 4));
87 //calculate number of iters
88 iters = (querySeqs[0]->getAligned().length() - window + 1) / increment;
89 cout << "length = " << querySeqs[0]->getAligned().length() << " window = " << window << " increment = " << increment << " iters = " << iters << endl;
90 int linesPerProcess = processors / numSeqs;
92 //find breakup of sequences for all times we will Parallelize
93 if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
96 for (int i = 0; i < (processors-1); i++) {
97 lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
99 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
100 int i = processors - 1;
101 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
104 //map query sequences to their most similiar sequences in the template - Parallelized
105 mothurOut("Finding closest sequence in template to each sequence... "); cout.flush();
106 if (processors == 1) { findPairs(lines[0]->start, lines[0]->end); }
107 else { createProcessesPairs(); }
108 mothurOut("Done."); mothurOutEndLine();
110 //find Oqs for each sequence - the observed distance in each window - Parallelized
111 mothurOut("Calculating observed percentage differences for each sequence... "); cout.flush();
112 if (processors == 1) { calcObserved(lines[0]->start, lines[0]->end); }
113 else { createProcessesObserved(); }
114 mothurOut("Done."); mothurOutEndLine();
117 mothurOut("Calculating expected percentage differences for each sequence... "); cout.flush();
118 vector<float> probabilityProfile = calcFreq(templateSeqs);
121 for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; }
124 averageProbability = findQav(probabilityProfile);
126 //find Coefficient - maps a sequence to its coefficient
127 seqCoef = getCoef(averageProbability);
129 //find Eqs for each sequence - the expected distance in each window - Parallelized
130 if (processors == 1) { calcExpected(lines[0]->start, lines[0]->end); }
131 else { createProcessesExpected(); }
132 mothurOut("Done."); mothurOutEndLine();
134 //find deviation - Parallelized
135 mothurOut("Finding deviation from expected... "); cout.flush();
136 if (processors == 1) { calcDE(lines[0]->start, lines[0]->end); }
137 else { createProcessesDE(); }
138 mothurOut("Done."); mothurOutEndLine();
142 for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
143 delete distCalculator;
146 catch(exception& e) {
147 errorOut(e, "Pintail", "getChimeras");
152 //***************************************************************************************************************
154 vector<Sequence*> Pintail::readSeqs(string file) {
158 openInputFile(file, in);
159 vector<Sequence*> container;
161 //read in seqs and store in vector
163 Sequence* current = new Sequence(in);
165 if (current->getAligned() == "") { current->setAligned(current->getUnaligned()); }
166 //takes out stuff is needed
167 current->setUnaligned(current->getUnaligned());
169 container.push_back(current);
178 catch(exception& e) {
179 errorOut(e, "Pintail", "readSeqs");
184 //***************************************************************************************************************
185 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
186 void Pintail::findPairs(int start, int end) {
189 for(int i = start; i < end; i++){
191 float smallest = 10000.0;
192 Sequence query = *(querySeqs[i]);
194 for(int j = 0; j < templateSeqs.size(); j++){
196 Sequence temp = *(templateSeqs[j]);
198 distCalculator->calcDist(query, temp);
199 float dist = distCalculator->getDist();
201 if (dist < smallest) {
203 bestfit[querySeqs[i]] = templateSeqs[j];
210 catch(exception& e) {
211 errorOut(e, "Pintail", "findPairs");
215 //***************************************************************************************************************
216 void Pintail::calcObserved(int start, int end) {
220 for(int i = start; i < end; i++){
222 itBest = bestfit.find(querySeqs[i]);
226 if (itBest != bestfit.end()) {
227 query = itBest->first;
228 subject = itBest->second;
229 }else{ mothurOut("Error in calcObserved"); mothurOutEndLine(); }
230 //cout << query->getName() << '\t' << subject->getName() << endl;
233 for (int m = 0; m < iters; m++) {
235 string seqFrag = query->getAligned().substr(startpoint, window);
236 string seqFragsub = subject->getAligned().substr(startpoint, window);
239 for (int b = 0; b < seqFrag.length(); b++) {
241 //if this is not a gap
242 if ((isalpha(seqFrag[b])) && (isalpha(seqFragsub[b]))) {
243 //and they are different - penalize
244 if (seqFrag[b] != seqFragsub[b]) { diff++; }
248 //percentage of mismatched bases
249 float dist = diff / (float)seqFrag.length();
251 obsDistance[query].push_back(dist);
253 startpoint += increment;
258 catch(exception& e) {
259 errorOut(e, "Pintail", "calcObserved");
264 //***************************************************************************************************************
265 void Pintail::calcExpected(int start, int end) {
270 for(int i = start; i < end; i++){
272 itCoef = seqCoef.find(querySeqs[i]);
273 float coef = itCoef->second;
276 vector<float> queryExpected;
277 for (int m = 0; m < iters; m++) {
278 float expected = averageProbability[m] * coef;
279 queryExpected.push_back(expected);
280 //cout << "average variabilty over window = " << averageProbability[m] << " coef = " << coef << " ei = " << expected << '\t' << "window = " << m << endl;
283 expectedDistance[querySeqs[i]] = queryExpected;
288 catch(exception& e) {
289 errorOut(e, "Pintail", "calcExpected");
293 //***************************************************************************************************************
294 void Pintail::calcDE(int start, int end) {
299 for(int i = start; i < end; i++){
301 itObsDist = obsDistance.find(querySeqs[i]);
302 vector<float> obs = itObsDist->second;
304 itExpDist = expectedDistance.find(querySeqs[i]);
305 vector<float> exp = itExpDist->second;
306 // cout << "difference between obs and exp = " << abs(obs[m] - exp[m]) << endl;
308 float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2
309 for (int m = 0; m < iters; m++) { sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); }
311 float de = sqrt((sum / (iters - 1)));
313 DE[querySeqs[i]] = de;
317 catch(exception& e) {
318 errorOut(e, "Pintail", "calcDE");
323 //***************************************************************************************************************
325 vector<float> Pintail::calcFreq(vector<Sequence*> seqs) {
330 //at each position in the sequence
331 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
333 vector<int> freq; freq.resize(4,0);
336 //find the frequency of each nucleotide
337 for (int j = 0; j < seqs.size(); j++) {
339 char value = seqs[j]->getAligned()[i];
341 if(toupper(value) == 'A') { freq[0]++; }
342 else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; }
343 else if(toupper(value) == 'G') { freq[2]++; }
344 else if(toupper(value) == 'C') { freq[3]++; }
348 //find base with highest frequency
350 for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } }
352 //add in gaps - so you can effectively "ignore them"
355 float highFreq = highest / (float) seqs.size();
358 Pi = (highFreq - 0.25) / 0.75;
366 catch(exception& e) {
367 errorOut(e, "Pintail", "calcFreq");
371 //***************************************************************************************************************
372 vector<float> Pintail::findQav(vector<float> prob) {
374 vector<float> averages;
376 //for each window find average
378 for (int m = 0; m < iters; m++) {
381 for (int i = startpoint; i < (startpoint+window); i++) { average += prob[i]; }
383 average = average / window;
384 //cout << average << endl;
385 //save this windows average
386 averages.push_back(average);
388 startpoint += increment;
393 catch(exception& e) {
394 errorOut(e, "Pintail", "findQav");
398 //***************************************************************************************************************
399 map<Sequence*, float> Pintail::getCoef(vector<float> prob) {
401 map<Sequence*, float> coefs;
404 float probAverage = 0.0;
405 for (int i = 0; i < prob.size(); i++) { probAverage += prob[i]; }
406 probAverage = probAverage / (float) prob.size();
407 cout << "(sum of ai) / m = " << probAverage << endl;
408 //find a coef for each sequence
409 map<Sequence*, vector<float> >::iterator it;
410 for (it = obsDistance.begin(); it != obsDistance.end(); it++) {
412 vector<float> temp = it->second;
413 Sequence* tempSeq = it->first;
415 //find observed average
416 float obsAverage = 0.0;
417 for (int i = 0; i < temp.size(); i++) { obsAverage += temp[i]; }
418 obsAverage = obsAverage / (float) temp.size();
419 cout << tempSeq->getName() << '\t' << obsAverage << endl;
420 float coef = obsAverage / probAverage;
421 cout << tempSeq->getName() << '\t' << "coef = " << coef << endl;
422 //save this sequences coefficient
423 coefs[tempSeq] = coef;
429 catch(exception& e) {
430 errorOut(e, "Pintail", "getCoef");
435 /**************************************************************************************************/
437 void Pintail::createProcessesPairs() {
439 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
441 vector<int> processIDS;
443 //loop through and create all the processes you want
444 while (process != processors) {
448 processIDS.push_back(pid);
451 findPairs(lines[process]->start, lines[process]->end);
453 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
456 //force parent to wait until all the processes are done
457 for (int i=0;i<processors;i++) {
458 int temp = processIDS[i];
463 findPairs(lines[0]->start, lines[0]->end);
467 catch(exception& e) {
468 errorOut(e, "Pintail", "createProcessesPairs");
473 /**************************************************************************************************/
475 void Pintail::createProcessesObserved() {
477 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
479 vector<int> processIDS;
481 //loop through and create all the processes you want
482 while (process != processors) {
486 processIDS.push_back(pid);
489 calcObserved(lines[process]->start, lines[process]->end);
491 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
494 //force parent to wait until all the processes are done
495 for (int i=0;i<processors;i++) {
496 int temp = processIDS[i];
501 calcObserved(lines[0]->start, lines[0]->end);
505 catch(exception& e) {
506 errorOut(e, "Pintail", "createProcessesObserved");
511 //***************************************************************************************************************
513 void Pintail::createProcessesExpected() {
515 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
517 vector<int> processIDS;
519 //loop through and create all the processes you want
520 while (process != processors) {
524 processIDS.push_back(pid);
527 calcExpected(lines[process]->start, lines[process]->end);
529 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
532 //force parent to wait until all the processes are done
533 for (int i=0;i<processors;i++) {
534 int temp = processIDS[i];
539 calcExpected(lines[0]->start, lines[0]->end);
543 catch(exception& e) {
544 errorOut(e, "Pintail", "createProcessesExpected");
549 /**************************************************************************************************/
551 void Pintail::createProcessesDE() {
553 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
555 vector<int> processIDS;
557 //loop through and create all the processes you want
558 while (process != processors) {
562 processIDS.push_back(pid);
565 calcDE(lines[process]->start, lines[process]->end);
567 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
570 //force parent to wait until all the processes are done
571 for (int i=0;i<processors;i++) {
572 int temp = processIDS[i];
577 calcDE(lines[0]->start, lines[0]->end);
581 catch(exception& e) {
582 errorOut(e, "Pintail", "createProcessesDE");
587 //***************************************************************************************************************