]> git.donarmstrong.com Git - mothur.git/blob - pintail.cpp
added warning about merging with something above cutoff to cluster. working on chimeras
[mothur.git] / pintail.cpp
1 /*
2  *  pintail.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/9/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "pintail.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14 //********************************************************************************************************************
15 //sorts lowest to highest
16 inline bool compareQuanMembers(quanMember left, quanMember right){
17         return (left.score < right.score);      
18
19 //***************************************************************************************************************
20
21 Pintail::Pintail(string filename, string o) {  
22         fastafile = filename;  outputDir = o; 
23         distcalculator = new eachGapDist();
24         decalc = new DeCalculator();
25 }
26 //***************************************************************************************************************
27
28 Pintail::~Pintail() {
29         try {
30                 
31                 delete distcalculator;
32                 delete decalc; 
33         }
34         catch(exception& e) {
35                 errorOut(e, "Pintail", "~Pintail");
36                 exit(1);
37         }
38 }
39 //***************************************************************************************************************
40 void Pintail::doPrep() {
41         try {
42                 
43                 mergedFilterString = "";
44                 windowSizesTemplate.resize(templateSeqs.size(), window);
45                 quantiles.resize(100);  //one for every percent mismatch
46                 quantilesMembers.resize(100);  //one for every percent mismatch
47                 
48                 //if the user does not enter a mask then you want to keep all the spots in the alignment
49                 if (seqMask.length() == 0)      {       decalc->setAlignmentLength(templateSeqs[0]->getAligned().length());     }
50                 else                                            {       decalc->setAlignmentLength(seqMask.length());                                           }
51                 
52                 decalc->setMask(seqMask);
53                 
54                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
55                         //find breakup of templatefile for quantiles
56                         if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
57                         else { 
58                                 for (int i = 0; i < processors; i++) {
59                                         templateLines.push_back(new linePair());
60                                         templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
61                                         templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
62                                 }
63                         }
64                 #else
65                         templateLines.push_back(new linePair(0, templateSeqs.size()));
66                 #endif
67
68                 
69                 mothurOut("Getting conservation... "); cout.flush();
70                 if (consfile == "") { 
71                         mothurOut("Calculating probability of conservation for your template sequences.  This can take a while...  I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command.  Providing the .freq file will improve speed.    "); cout.flush();
72                         probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName)); 
73                         mothurOut("Done."); mothurOutEndLine();
74                 }else                           {   probabilityProfile = readFreq();    mothurOut("Done.");               }
75                 mothurOutEndLine();
76                 
77                 //make P into Q
78                 for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];  }  //cout << i << '\t' << probabilityProfile[i] << endl;
79                 
80                 bool reRead = false;
81                 //create filter if needed for later
82                 if (filter) {
83                         
84 cout << "filter" << endl;                       
85                                                 
86                         //read in all query seqs
87                         ifstream in; 
88                         openInputFile(fastafile, in);
89                         
90                         vector<Sequence*> tempQuerySeqs;
91                         while(!in.eof()){
92                                 Sequence* s = new Sequence(in);
93                                 gobble(in);
94                                 
95                                 if (s->getName() != "") { tempQuerySeqs.push_back(s); }
96                         }
97                         in.close();
98                         
99                         vector<Sequence*> temp;
100                         //merge query seqs and template seqs
101                         temp = templateSeqs;
102                         for (int i = 0; i < tempQuerySeqs.size(); i++) {  temp.push_back(tempQuerySeqs[i]);  }
103 cout << temp.size() << endl;    
104                         if (seqMask != "") {
105                             reRead = true;
106         cout << "masked "  << seqMask << endl;
107                                 //mask templates
108                                 for (int i = 0; i < temp.size(); i++) {
109                                         decalc->runMask(temp[i]);
110                                 }
111                         }
112
113                         mergedFilterString = createFilter(temp, 0.5);
114         cout << "here" << endl;         
115                         //reread template seqs
116                         //for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
117                 }
118                 
119                 
120                 //quantiles are used to determine whether the de values found indicate a chimera
121                 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each 
122                 //combination of sequences in the template
123                 if (quanfile != "") {  
124                         quantiles = readQuantiles(); 
125                 }else {
126                         if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above.
127                                 reRead = true;
128                                 //mask templates
129                                 for (int i = 0; i < templateSeqs.size(); i++) {
130                                         decalc->runMask(templateSeqs[i]);
131                                 }
132                         }
133                         
134                         if (filter) { 
135                                 reRead = true;
136                                 for (int i = 0; i < templateSeqs.size(); i++) {
137                                         runFilter(templateSeqs[i]);
138                                 }
139                         }
140                         
141                         mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
142                         if (processors == 1) { 
143                                 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
144                         }else {         createProcessesQuan();          }
145                 
146                         
147                         ofstream out4, out5;
148                         string noOutliers, outliers;
149                         
150                         if ((!filter) && (seqMask == "")) {
151                                 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan";
152                         }else if ((!filter) && (seqMask != "")) { 
153                                 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan";
154                         }else if ((filter) && (seqMask != "")) { 
155                                 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "masked.quan";
156                         }else if ((filter) && (seqMask == "")) { 
157                                 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "quan";
158                         }
159
160
161
162                                                 
163                         decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
164                         
165                         openOutputFile(noOutliers, out5);                       
166                         //adjust quantiles
167                         for (int i = 0; i < quantilesMembers.size(); i++) {
168                                 vector<float> temp;
169                                 
170                                 if (quantilesMembers[i].size() == 0) {
171                                         //in case this is not a distance found in your template files
172                                         for (int g = 0; g < 6; g++) {
173                                                 temp.push_back(0.0);
174                                         }
175                                 }else{
176                                         
177                                         sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
178                                         
179                                         //save 10%
180                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
181                                         //save 25%
182                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
183                                         //save 50%
184                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
185                                         //save 75%
186                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
187                                         //save 95%
188                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
189                                         //save 99%
190                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
191                                         
192                                 }
193                                 
194                                 //output quan value
195                                 out5 << i+1 << '\t';                            
196                                 for (int u = 0; u < temp.size(); u++) {   out5 << temp[u] << '\t'; }
197                                 out5 << endl;
198                                 
199                                 quantiles[i] = temp;
200                                 
201                         }
202
203                         mothurOut("Done."); mothurOutEndLine();
204                 }
205                 
206                 if (reRead) {
207                         for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i];  }
208                         templateSeqs.clear();
209                         templateSeqs = readSeqs(templateFileName);
210                 }
211
212                 
213                 //free memory
214                 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i];  }
215                 
216         }
217         catch(exception& e) {
218                 errorOut(e, "Pintail", "doPrep");
219                 exit(1);
220         }
221 }
222 //***************************************************************************************************************
223 void Pintail::print(ostream& out, ostream& outAcc) {
224         try {
225                 int index = ceil(deviation);
226                 
227                 //is your DE value higher than the 95%
228                 string chimera;
229                 if (index != 0) {  //if index is 0 then its an exact match to a template seq
230                         if (quantiles[index][4] == 0.0) {
231                                 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
232                         }else {
233                                 if (DE > quantiles[index][4])           {       chimera = "Yes";        }
234                                 else                                                            {       chimera = "No";         }
235                         }
236                 }else{ chimera = "No";          }
237                 
238                 out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
239                 if (chimera == "Yes") {
240                         mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); mothurOutEndLine();
241                         outAcc << querySeq->getName() << endl;
242                 }
243                 out << "Observed\t";
244                 
245                 for (int j = 0; j < obsDistance.size(); j++) {  out << obsDistance[j] << '\t';  }
246                 out << endl;
247                 
248                 out << "Expected\t";
249                 
250                 for (int m = 0; m < expectedDistance.size(); m++) {  out << expectedDistance[m] << '\t';  }
251                 out << endl;
252                 
253                 
254         }
255         catch(exception& e) {
256                 errorOut(e, "Pintail", "print");
257                 exit(1);
258         }
259 }
260
261 //***************************************************************************************************************
262 int Pintail::getChimeras(Sequence* query) {
263         try {
264                 querySeq = query;
265                 trimmed.clear();
266                 windowSizes = window;
267                                                         
268                 //find pairs has to be done before a mask
269                 bestfit = findPairs(query);
270                 
271                 //if they mask  
272                 if (seqMask != "") {
273                         decalc->runMask(query);
274                         decalc->runMask(bestfit);
275                 }
276
277                 if (filter) { //must be done after a mask
278                         runFilter(query);
279                         runFilter(bestfit);
280                 }
281                 
282                                 
283                 //trim seq
284                 decalc->trimSeqs(query, bestfit, trimmed);  
285                 
286                 //find windows
287                 it = trimmed.begin();
288                 windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment);
289
290                 //find observed distance
291                 obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes);
292                                 
293                 Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile);
294
295                 //find alpha                    
296                 seqCoef = decalc->getCoef(obsDistance, Qav);
297                 
298                 //calculating expected distance
299                 expectedDistance = decalc->calcExpected(Qav, seqCoef);
300                 
301                 //finding de
302                 DE = decalc->calcDE(obsDistance, expectedDistance);
303                 
304                 //find distance between query and closest match
305                 it = trimmed.begin();
306                 deviation = decalc->calcDist(query, bestfit, it->first, it->second); 
307                 
308                 delete bestfit;
309                                                                         
310                 return 0;
311         }
312         catch(exception& e) {
313                 errorOut(e, "Pintail", "getChimeras");
314                 exit(1);
315         }
316 }
317
318 //***************************************************************************************************************
319
320 vector<float> Pintail::readFreq() {
321         try {
322         
323                 ifstream in;
324                 openInputFile(consfile, in);
325                 
326                 vector<float> prob;
327                 set<int> h = decalc->getPos();  //positions of bases in masking sequence
328                 
329                 //read in probabilities and store in vector
330                 int pos; float num; 
331                 
332                 while(!in.eof()){
333                         
334                         in >> pos >> num;
335                         
336                         if (h.count(pos) > 0) {
337                                 float Pi;
338                                 Pi =  (num - 0.25) / 0.75; 
339                         
340                                 //cannot have probability less than 0.
341                                 if (Pi < 0) { Pi = 0.0; }
342
343                                 //do you want this spot
344                                 prob.push_back(Pi);  
345                         }
346                         
347                         gobble(in);
348                 }
349                 
350                 in.close();
351                 return prob;
352                 
353         }
354         catch(exception& e) {
355                 errorOut(e, "Pintail", "readFreq");
356                 exit(1);
357         }
358 }
359
360 //***************************************************************************************************************
361 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
362 Sequence* Pintail::findPairs(Sequence* q) {
363         try {
364                 
365                 Sequence* seqsMatches;  
366                 
367                 seqsMatches = decalc->findClosest(q, templateSeqs);
368                 return seqsMatches;
369         
370         }
371         catch(exception& e) {
372                 errorOut(e, "Pintail", "findPairs");
373                 exit(1);
374         }
375 }
376 /**************************************************************************************************/
377 void Pintail::createProcessesQuan() {
378         try {
379 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
380                 int process = 0;
381                 vector<int> processIDS;
382                                 
383                 //loop through and create all the processes you want
384                 while (process != processors) {
385                         int pid = fork();
386                         
387                         if (pid > 0) {
388                                 processIDS.push_back(pid);  
389                                 process++;
390                         }else if (pid == 0){
391                                 
392                                 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
393                                 
394                                 //write out data to file so parent can read it
395                                 ofstream out;
396                                 string s = toString(getpid()) + ".temp";
397                                 openOutputFile(s, out);
398                                 
399                                                                 
400                                 //output observed distances
401                                 for (int i = 0; i < quantilesMembers.size(); i++) {
402                                         out << quantilesMembers[i].size() << '\t';
403                                         for (int j = 0; j < quantilesMembers[i].size(); j++) {
404                                                 out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
405                                         }
406                                         out << endl;
407                                 }
408                                 
409                                 out.close();
410                                 
411                                 exit(0);
412                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
413                 }
414                 
415                 //force parent to wait until all the processes are done
416                 for (int i=0;i<processors;i++) { 
417                         int temp = processIDS[i];
418                         wait(&temp);
419                 }
420
421                 //get data created by processes
422                 for (int i=0;i<processors;i++) { 
423                         ifstream in;
424                         string s = toString(processIDS[i]) + ".temp";
425                         openInputFile(s, in);
426                         
427                         vector< vector<quanMember> > quan; 
428                         quan.resize(100);
429                         
430                         //get quantiles
431                         for (int m = 0; m < quan.size(); m++) {
432                                 int num;
433                                 in >> num; 
434                                 
435                                 gobble(in);
436
437                                 vector<quanMember> q;  float w; int b, n;
438                                 for (int j = 0; j < num; j++) {
439                                         in >> w >> b >> n;
440         //cout << w << '\t' << b << '\t' n << endl;
441                                         quanMember newMember(w, b, n);
442                                         q.push_back(newMember);
443                                 }
444 //cout << "here" << endl;
445                                 quan[m] = q;
446 //cout << "now here" << endl;
447                                 gobble(in);
448                         }
449                         
450         
451                         //save quan in quantiles
452                         for (int j = 0; j < quan.size(); j++) {
453                                 //put all values of q[i] into quan[i]
454                                 for (int l = 0; l < quan[j].size(); l++) {  quantilesMembers[j].push_back(quan[j][l]);   }
455                                 //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
456                         }
457                                         
458                         in.close();
459                         remove(s.c_str());
460                 }
461                 
462 #else
463                 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
464 #endif          
465         }
466         catch(exception& e) {
467                 errorOut(e, "Pintail", "createProcessesQuan");
468                 exit(1);
469         }
470 }
471
472
473 //***************************************************************************************************************
474
475