]> git.donarmstrong.com Git - mothur.git/blob - pintail.cpp
added checks for ^C to quit command instead of program
[mothur.git] / pintail.cpp
1 /*
2  *  pintail.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/9/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "pintail.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14 //********************************************************************************************************************
15 //sorts lowest to highest
16 inline bool compareQuanMembers(quanMember left, quanMember right){
17         return (left.score < right.score);      
18
19 //***************************************************************************************************************
20
21 Pintail::Pintail(string filename, string o) {  
22         fastafile = filename;  outputDir = o; 
23         distcalculator = new eachGapDist();
24         decalc = new DeCalculator();
25 }
26 //***************************************************************************************************************
27
28 Pintail::~Pintail() {
29         try {
30                 
31                 delete distcalculator;
32                 delete decalc; 
33         }
34         catch(exception& e) {
35                 m->errorOut(e, "Pintail", "~Pintail");
36                 exit(1);
37         }
38 }
39 //***************************************************************************************************************
40 int Pintail::doPrep() {
41         try {
42                 
43                 mergedFilterString = "";
44                 windowSizesTemplate.resize(templateSeqs.size(), window);
45                 quantiles.resize(100);  //one for every percent mismatch
46                 quantilesMembers.resize(100);  //one for every percent mismatch
47                 
48                 //if the user does not enter a mask then you want to keep all the spots in the alignment
49                 if (seqMask.length() == 0)      {       decalc->setAlignmentLength(templateSeqs[0]->getAligned().length());     }
50                 else                                            {       decalc->setAlignmentLength(seqMask.length());                                           }
51                 
52                 decalc->setMask(seqMask);
53                 
54                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
55                         //find breakup of templatefile for quantiles
56                         if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
57                         else { 
58                                 for (int i = 0; i < processors; i++) {
59                                         templateLines.push_back(new linePair());
60                                         templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
61                                         templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
62                                 }
63                         }
64                 #else
65                         templateLines.push_back(new linePair(0, templateSeqs.size()));
66                 #endif
67
68                 
69                 m->mothurOut("Getting conservation... "); cout.flush();
70                 if (consfile == "") { 
71                         m->mothurOut("Calculating probability of conservation for your template sequences.  This can take a while...  I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command.  Providing the .freq file will improve speed.    "); cout.flush();
72                         probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName)); 
73                         if (m->control_pressed) {  return 0;  }
74                         m->mothurOut("Done."); m->mothurOutEndLine();
75                 }else                           {   probabilityProfile = readFreq();    m->mothurOut("Done.");            }
76                 m->mothurOutEndLine();
77                 
78                 //make P into Q
79                 for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];  }  //cout << i << '\t' << probabilityProfile[i] << endl;
80                 
81                 bool reRead = false;
82                 //create filter if needed for later
83                 if (filter) {
84                                                 
85                         //read in all query seqs
86                         ifstream in; 
87                         openInputFile(fastafile, in);
88                         
89                         vector<Sequence*> tempQuerySeqs;
90                         while(!in.eof()){
91                                 if (m->control_pressed) {  
92                                         for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
93                                         return 0; 
94                                 }
95                                 
96                                 Sequence* s = new Sequence(in);
97                                 gobble(in);
98                                 
99                                 if (s->getName() != "") { tempQuerySeqs.push_back(s); }
100                         }
101                         in.close();
102                         
103                         vector<Sequence*> temp;
104                         //merge query seqs and template seqs
105                         temp = templateSeqs;
106                         for (int i = 0; i < tempQuerySeqs.size(); i++) {  temp.push_back(tempQuerySeqs[i]);  }
107         
108                         if (seqMask != "") {
109                             reRead = true;
110                                 //mask templates
111                                 for (int i = 0; i < temp.size(); i++) {
112                                         if (m->control_pressed) {  
113                                                 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
114                                                 return 0; 
115                                         }
116                                         decalc->runMask(temp[i]);
117                                 }
118                         }
119
120                         mergedFilterString = createFilter(temp, 0.5);
121                         
122                         if (m->control_pressed) {  
123                                 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
124                                 return 0; 
125                         }
126                         
127                         //reread template seqs
128                         for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
129                 }
130                 
131                 
132                 //quantiles are used to determine whether the de values found indicate a chimera
133                 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each 
134                 //combination of sequences in the template
135                 if (quanfile != "") {  
136                         quantiles = readQuantiles(); 
137                 }else {
138                         if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above.
139                                 reRead = true;
140                                 //mask templates
141                                 for (int i = 0; i < templateSeqs.size(); i++) {
142                                         if (m->control_pressed) {  return 0;  }
143                                         decalc->runMask(templateSeqs[i]);
144                                 }
145                         }
146                         
147                         if (filter) { 
148                                 reRead = true;
149                                 for (int i = 0; i < templateSeqs.size(); i++) {
150                                         if (m->control_pressed) {  return 0;  }
151                                         runFilter(templateSeqs[i]);
152                                 }
153                         }
154                         
155                         m->mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
156                         if (processors == 1) { 
157                                 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
158                         }else {         createProcessesQuan();          }
159                 
160                         if (m->control_pressed) {  return 0;  }
161                         
162                         ofstream out4, out5;
163                         string noOutliers, outliers;
164                         
165                         if ((!filter) && (seqMask == "")) {
166                                 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan";
167                         }else if ((!filter) && (seqMask != "")) { 
168                                 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan";
169                         }else if ((filter) && (seqMask != "")) { 
170                                 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "masked.quan";
171                         }else if ((filter) && (seqMask == "")) { 
172                                 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "quan";
173                         }
174
175                         decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
176                         
177                         if (m->control_pressed) {  return 0;  }
178                         
179                         openOutputFile(noOutliers, out5);                       
180                         //adjust quantiles
181                         for (int i = 0; i < quantilesMembers.size(); i++) {
182                                 vector<float> temp;
183                                 
184                                 if (quantilesMembers[i].size() == 0) {
185                                         //in case this is not a distance found in your template files
186                                         for (int g = 0; g < 6; g++) {
187                                                 temp.push_back(0.0);
188                                         }
189                                 }else{
190                                         
191                                         sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
192                                         
193                                         //save 10%
194                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
195                                         //save 25%
196                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
197                                         //save 50%
198                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
199                                         //save 75%
200                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
201                                         //save 95%
202                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
203                                         //save 99%
204                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
205                                         
206                                 }
207                                 
208                                 //output quan value
209                                 out5 << i+1 << '\t';                            
210                                 for (int u = 0; u < temp.size(); u++) {   out5 << temp[u] << '\t'; }
211                                 out5 << endl;
212                                 
213                                 quantiles[i] = temp;
214                                 
215                         }
216
217                         m->mothurOut("Done."); m->mothurOutEndLine();
218                 }
219                 
220                 if (reRead) {
221                         for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i];  }
222                         templateSeqs.clear();
223                         templateSeqs = readSeqs(templateFileName);
224                 }
225
226                 
227                 //free memory
228                 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i];  }
229                 
230                 return 0;
231                 
232         }
233         catch(exception& e) {
234                 m->errorOut(e, "Pintail", "doPrep");
235                 exit(1);
236         }
237 }
238 //***************************************************************************************************************
239 int Pintail::print(ostream& out, ostream& outAcc) {
240         try {
241                 int index = ceil(deviation);
242                 
243                 //is your DE value higher than the 95%
244                 string chimera;
245                 if (index != 0) {  //if index is 0 then its an exact match to a template seq
246                         if (quantiles[index][4] == 0.0) {
247                                 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
248                         }else {
249                                 if (DE > quantiles[index][4])           {       chimera = "Yes";        }
250                                 else                                                            {       chimera = "No";         }
251                         }
252                 }else{ chimera = "No";          }
253                 
254                 out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
255                 if (chimera == "Yes") {
256                         m->mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); m->mothurOutEndLine();
257                         outAcc << querySeq->getName() << endl;
258                 }
259                 out << "Observed\t";
260                 
261                 for (int j = 0; j < obsDistance.size(); j++) {  out << obsDistance[j] << '\t';  }
262                 out << endl;
263                 
264                 out << "Expected\t";
265                 
266                 for (int m = 0; m < expectedDistance.size(); m++) {  out << expectedDistance[m] << '\t';  }
267                 out << endl;
268                 
269                 return 0;
270                 
271         }
272         catch(exception& e) {
273                 m->errorOut(e, "Pintail", "print");
274                 exit(1);
275         }
276 }
277
278 //***************************************************************************************************************
279 int Pintail::getChimeras(Sequence* query) {
280         try {
281                 querySeq = query;
282                 trimmed.clear();
283                 windowSizes = window;
284                                                         
285                 //find pairs has to be done before a mask
286                 bestfit = findPairs(query);
287                 
288                 if (m->control_pressed) {  return 0; } 
289                 
290                 //if they mask  
291                 if (seqMask != "") {
292                         decalc->runMask(query);
293                         decalc->runMask(bestfit);
294                 }
295
296                 if (filter) { //must be done after a mask
297                         runFilter(query);
298                         runFilter(bestfit);
299                 }
300                 
301                                 
302                 //trim seq
303                 decalc->trimSeqs(query, bestfit, trimmed);  
304                 
305                 //find windows
306                 it = trimmed.begin();
307                 windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment);
308
309                 //find observed distance
310                 obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes);
311                 
312                 if (m->control_pressed) {  return 0; } 
313                                 
314                 Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile);
315                 
316                 if (m->control_pressed) {  return 0; } 
317
318                 //find alpha                    
319                 seqCoef = decalc->getCoef(obsDistance, Qav);
320                 
321                 //calculating expected distance
322                 expectedDistance = decalc->calcExpected(Qav, seqCoef);
323                 
324                 if (m->control_pressed) {  return 0; } 
325                 
326                 //finding de
327                 DE = decalc->calcDE(obsDistance, expectedDistance);
328                 
329                 if (m->control_pressed) {  return 0; } 
330                 
331                 //find distance between query and closest match
332                 it = trimmed.begin();
333                 deviation = decalc->calcDist(query, bestfit, it->first, it->second); 
334                 
335                 delete bestfit;
336                                                                         
337                 return 0;
338         }
339         catch(exception& e) {
340                 m->errorOut(e, "Pintail", "getChimeras");
341                 exit(1);
342         }
343 }
344
345 //***************************************************************************************************************
346
347 vector<float> Pintail::readFreq() {
348         try {
349         
350                 ifstream in;
351                 openInputFile(consfile, in);
352                 
353                 vector<float> prob;
354                 set<int> h = decalc->getPos();  //positions of bases in masking sequence
355                 
356                 //read in probabilities and store in vector
357                 int pos; float num; 
358                 
359                 while(!in.eof()){
360                         
361                         in >> pos >> num;
362                         
363                         if (h.count(pos) > 0) {
364                                 float Pi;
365                                 Pi =  (num - 0.25) / 0.75; 
366                         
367                                 //cannot have probability less than 0.
368                                 if (Pi < 0) { Pi = 0.0; }
369
370                                 //do you want this spot
371                                 prob.push_back(Pi);  
372                         }
373                         
374                         gobble(in);
375                 }
376                 
377                 in.close();
378                 return prob;
379                 
380         }
381         catch(exception& e) {
382                 m->errorOut(e, "Pintail", "readFreq");
383                 exit(1);
384         }
385 }
386
387 //***************************************************************************************************************
388 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
389 Sequence* Pintail::findPairs(Sequence* q) {
390         try {
391                 
392                 Sequence* seqsMatches;  
393                 
394                 seqsMatches = decalc->findClosest(q, templateSeqs);
395                 return seqsMatches;
396         
397         }
398         catch(exception& e) {
399                 m->errorOut(e, "Pintail", "findPairs");
400                 exit(1);
401         }
402 }
403 /**************************************************************************************************/
404 void Pintail::createProcessesQuan() {
405         try {
406 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
407                 int process = 0;
408                 vector<int> processIDS;
409                                 
410                 //loop through and create all the processes you want
411                 while (process != processors) {
412                         int pid = fork();
413                         
414                         if (pid > 0) {
415                                 processIDS.push_back(pid);  
416                                 process++;
417                         }else if (pid == 0){
418                                 
419                                 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
420                                 
421                                 //write out data to file so parent can read it
422                                 ofstream out;
423                                 string s = toString(getpid()) + ".temp";
424                                 openOutputFile(s, out);
425                                 
426                                                                 
427                                 //output observed distances
428                                 for (int i = 0; i < quantilesMembers.size(); i++) {
429                                         out << quantilesMembers[i].size() << '\t';
430                                         for (int j = 0; j < quantilesMembers[i].size(); j++) {
431                                                 out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
432                                         }
433                                         out << endl;
434                                 }
435                                 
436                                 out.close();
437                                 
438                                 exit(0);
439                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
440                 }
441                 
442                 //force parent to wait until all the processes are done
443                 for (int i=0;i<processors;i++) { 
444                         int temp = processIDS[i];
445                         wait(&temp);
446                 }
447
448                 //get data created by processes
449                 for (int i=0;i<processors;i++) { 
450                         ifstream in;
451                         string s = toString(processIDS[i]) + ".temp";
452                         openInputFile(s, in);
453                         
454                         vector< vector<quanMember> > quan; 
455                         quan.resize(100);
456                         
457                         //get quantiles
458                         for (int m = 0; m < quan.size(); m++) {
459                                 int num;
460                                 in >> num; 
461                                 
462                                 gobble(in);
463
464                                 vector<quanMember> q;  float w; int b, n;
465                                 for (int j = 0; j < num; j++) {
466                                         in >> w >> b >> n;
467         //cout << w << '\t' << b << '\t' n << endl;
468                                         quanMember newMember(w, b, n);
469                                         q.push_back(newMember);
470                                 }
471 //cout << "here" << endl;
472                                 quan[m] = q;
473 //cout << "now here" << endl;
474                                 gobble(in);
475                         }
476                         
477         
478                         //save quan in quantiles
479                         for (int j = 0; j < quan.size(); j++) {
480                                 //put all values of q[i] into quan[i]
481                                 for (int l = 0; l < quan[j].size(); l++) {  quantilesMembers[j].push_back(quan[j][l]);   }
482                                 //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
483                         }
484                                         
485                         in.close();
486                         remove(s.c_str());
487                 }
488                 
489 #else
490                 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
491 #endif          
492         }
493         catch(exception& e) {
494                 m->errorOut(e, "Pintail", "createProcessesQuan");
495                 exit(1);
496         }
497 }
498
499
500 //***************************************************************************************************************
501
502