]> git.donarmstrong.com Git - mothur.git/blob - pintail.cpp
working on chimeras
[mothur.git] / pintail.cpp
1 /*
2  *  pintail.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/9/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "pintail.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14 //********************************************************************************************************************
15 //sorts lowest to highest
16 inline bool compareQuanMembers(quanMember left, quanMember right){
17         return (left.score < right.score);      
18
19 //***************************************************************************************************************
20
21 Pintail::Pintail(string filename, string o) {  
22         fastafile = filename;  outputDir = o; 
23         distcalculator = new eachGapDist();
24         decalc = new DeCalculator();
25 }
26 //***************************************************************************************************************
27
28 Pintail::~Pintail() {
29         try {
30                 
31                 delete distcalculator;
32                 delete decalc; 
33         }
34         catch(exception& e) {
35                 errorOut(e, "Pintail", "~Pintail");
36                 exit(1);
37         }
38 }
39 //***************************************************************************************************************
40 void Pintail::doPrep() {
41         try {
42         
43                 windowSizesTemplate.resize(templateSeqs.size(), window);
44                 quantiles.resize(100);  //one for every percent mismatch
45                 quantilesMembers.resize(100);  //one for every percent mismatch
46                 
47                 //if the user does not enter a mask then you want to keep all the spots in the alignment
48                 if (seqMask.length() == 0)      {       decalc->setAlignmentLength(templateSeqs[0]->getAligned().length());     }
49                 else                                            {       decalc->setAlignmentLength(seqMask.length());                                           }
50                 
51                 decalc->setMask(seqMask);
52                 
53                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
54                         //find breakup of templatefile for quantiles
55                         if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
56                         else { 
57                                 for (int i = 0; i < processors; i++) {
58                                         templateLines.push_back(new linePair());
59                                         templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
60                                         templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
61                                 }
62                         }
63                 #else
64                         templateLines.push_back(new linePair(0, templateSeqs.size()));
65                 #endif
66
67                 
68                 mothurOut("Getting conservation... "); cout.flush();
69                 if (consfile == "") { 
70                         mothurOut("Calculating probability of conservation for your template sequences.  This can take a while...  I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command.  Providing the .freq file will improve speed.    "); cout.flush();
71                         probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName)); 
72                         mothurOut("Done."); mothurOutEndLine();
73                 }else                           {   probabilityProfile = readFreq();    mothurOut("Done.");               }
74                 mothurOutEndLine();
75         
76                 //quantiles are used to determine whether the de values found indicate a chimera
77                 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each 
78                 //combination of sequences in the template
79                 if (quanfile != "") {  quantiles = readQuantiles();  }
80                 else {
81                         
82                         //if user has not provided the quantiles and wants the mask we need to mask, but then delete and reread or we will mess up the best match process in getChimeras
83                         if (seqMask != "") {
84                                 //mask templates
85                                 for (int i = 0; i < templateSeqs.size(); i++) {
86                                         decalc->runMask(templateSeqs[i]);
87                                 }
88                         }
89                         
90                         if (filter) {
91                                 createFilter(templateSeqs);
92                                 for (int i = 0; i < templateSeqs.size(); i++) {  runFilter(templateSeqs[i]);  }
93                         }
94
95                         
96                         mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
97                         if (processors == 1) { 
98                                 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
99                         }else {         createProcessesQuan();          }
100                 
101                         
102                         ofstream out4, out5;
103                         string noOutliers, outliers;
104                         
105                         if ((!filter) && (seqMask == "")) {
106                                 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan";
107                         }else if ((filter) && (seqMask == "")) { 
108                                 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.quan";
109                         }else if ((!filter) && (seqMask != "")) { 
110                                 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan";
111                         }else if ((filter) && (seqMask != "")) { 
112                                 noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.masked.quan";
113                         }
114
115                                                 
116                         decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
117                         
118                         openOutputFile(noOutliers, out5);
119                         
120                         //adjust quantiles
121                         for (int i = 0; i < quantilesMembers.size(); i++) {
122                                 vector<float> temp;
123                                 
124                                 if (quantilesMembers[i].size() == 0) {
125                                         //in case this is not a distance found in your template files
126                                         for (int g = 0; g < 6; g++) {
127                                                 temp.push_back(0.0);
128                                         }
129                                 }else{
130                                         
131                                         sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
132                                         
133                                         //save 10%
134                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
135                                         //save 25%
136                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
137                                         //save 50%
138                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
139                                         //save 75%
140                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
141                                         //save 95%
142                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
143                                         //save 99%
144                                         temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
145                                         
146                                 }
147                                 
148                                 //output quan value
149                                 out5 << i+1 << '\t';                            
150                                 for (int u = 0; u < temp.size(); u++) {   out5 << temp[u] << '\t'; }
151                                 out5 << endl;
152                                 
153                                 quantiles[i] = temp;
154                                 
155                         }
156
157                         mothurOut("Done."); mothurOutEndLine();
158                         
159                         //if mask reread template
160                         if ((seqMask != "") || (filter)) {
161                                 for (int i = 0; i < templateSeqs.size(); i++)           {  delete templateSeqs[i];              }
162                                 templateSeqs = readSeqs(templateFileName);
163                         }
164                         
165                 }
166                 
167                 //free memory
168                 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i];  }
169                 
170         }
171         catch(exception& e) {
172                 errorOut(e, "Pintail", "doPrep");
173                 exit(1);
174         }
175 }
176 //***************************************************************************************************************
177 void Pintail::print(ostream& out) {
178         try {
179                 int index = ceil(deviation);
180                 
181                 //is your DE value higher than the 95%
182                 string chimera;
183                 if (index != 0) {  //if index is 0 then its an exact match to a template seq
184                         if (quantiles[index][4] == 0.0) {
185                                 chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
186                         }else {
187                                 if (DE > quantiles[index][4])           {       chimera = "Yes";        }
188                                 else                                                            {       chimera = "No";         }
189                         }
190                 }else{ chimera = "No";          }
191                 
192                 out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
193                 if (chimera == "Yes") {
194                         mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); mothurOutEndLine();
195                 }
196                 out << "Observed\t";
197                 
198                 for (int j = 0; j < obsDistance.size(); j++) {  out << obsDistance[j] << '\t';  }
199                 out << endl;
200                 
201                 out << "Expected\t";
202                 
203                 for (int m = 0; m < expectedDistance.size(); m++) {  out << expectedDistance[m] << '\t';  }
204                 out << endl;
205                 
206                 
207         }
208         catch(exception& e) {
209                 errorOut(e, "Pintail", "print");
210                 exit(1);
211         }
212 }
213
214 //***************************************************************************************************************
215 int Pintail::getChimeras(Sequence* query) {
216         try {
217                 querySeq = query;
218                 trimmed.clear();
219                 windowSizes = window;
220                                 
221                 //find pairs
222                 bestfit = findPairs(query);
223                                                         
224                 //make P into Q
225                 for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];  }  //cout << i << '\t' << probabilityProfile[i] << endl;
226         
227                 //mask sequences if the user wants to 
228                 if (seqMask != "") {
229                         decalc->runMask(query);
230                         decalc->runMask(bestfit);
231                 }
232                 
233                 if (filter) {
234                         runFilter(query);
235                         runFilter(bestfit);
236                 }
237                 
238                 //trim seq
239                 decalc->trimSeqs(query, bestfit, trimmed);  
240                 
241                 //find windows
242                 it = trimmed.begin();
243                 windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment);
244
245                 //find observed distance
246                 obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes);
247                                 
248                 Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile);
249
250                 //find alpha                    
251                 seqCoef = decalc->getCoef(obsDistance, Qav);
252                 
253                 //calculating expected distance
254                 expectedDistance = decalc->calcExpected(Qav, seqCoef);
255                 
256                 //finding de
257                 DE = decalc->calcDE(obsDistance, expectedDistance);
258                 
259                 //find distance between query and closest match
260                 it = trimmed.begin();
261                 deviation = decalc->calcDist(query, bestfit, it->first, it->second); 
262                 
263                 delete bestfit;
264                                                                         
265                 return 0;
266         }
267         catch(exception& e) {
268                 errorOut(e, "Pintail", "getChimeras");
269                 exit(1);
270         }
271 }
272
273 //***************************************************************************************************************
274
275 vector<float> Pintail::readFreq() {
276         try {
277         
278                 ifstream in;
279                 openInputFile(consfile, in);
280                 
281                 vector<float> prob;
282                 set<int> h = decalc->getPos();  //positions of bases in masking sequence
283                 
284                 //read in probabilities and store in vector
285                 int pos; float num; 
286                 
287                 while(!in.eof()){
288                         
289                         in >> pos >> num;
290                         
291                         if (h.count(pos) > 0) {
292                                 float Pi;
293                                 Pi =  (num - 0.25) / 0.75; 
294                         
295                                 //cannot have probability less than 0.
296                                 if (Pi < 0) { Pi = 0.0; }
297
298                                 //do you want this spot
299                                 prob.push_back(Pi);  
300                         }
301                         
302                         gobble(in);
303                 }
304                 
305                 in.close();
306                 return prob;
307                 
308         }
309         catch(exception& e) {
310                 errorOut(e, "Pintail", "readFreq");
311                 exit(1);
312         }
313 }
314
315 //***************************************************************************************************************
316 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
317 Sequence* Pintail::findPairs(Sequence* q) {
318         try {
319                 
320                 Sequence* seqsMatches;  
321                 
322                 vector<Sequence*> copy = decalc->findClosest(q, templateSeqs, 1);
323                 seqsMatches = copy[0];
324                 
325                 return seqsMatches;
326         
327         }
328         catch(exception& e) {
329                 errorOut(e, "Pintail", "findPairs");
330                 exit(1);
331         }
332 }
333 /**************************************************************************************************/
334 void Pintail::createProcessesQuan() {
335         try {
336 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
337                 int process = 0;
338                 vector<int> processIDS;
339                                 
340                 //loop through and create all the processes you want
341                 while (process != processors) {
342                         int pid = fork();
343                         
344                         if (pid > 0) {
345                                 processIDS.push_back(pid);  
346                                 process++;
347                         }else if (pid == 0){
348                                 
349                                 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
350                                 
351                                 //write out data to file so parent can read it
352                                 ofstream out;
353                                 string s = toString(getpid()) + ".temp";
354                                 openOutputFile(s, out);
355                                 
356                                                                 
357                                 //output observed distances
358                                 for (int i = 0; i < quantilesMembers.size(); i++) {
359                                         out << quantilesMembers[i].size() << '\t';
360                                         for (int j = 0; j < quantilesMembers[i].size(); j++) {
361                                                 out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
362                                         }
363                                         out << endl;
364                                 }
365                                 
366                                 out.close();
367                                 
368                                 exit(0);
369                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
370                 }
371                 
372                 //force parent to wait until all the processes are done
373                 for (int i=0;i<processors;i++) { 
374                         int temp = processIDS[i];
375                         wait(&temp);
376                 }
377
378                 //get data created by processes
379                 for (int i=0;i<processors;i++) { 
380                         ifstream in;
381                         string s = toString(processIDS[i]) + ".temp";
382                         openInputFile(s, in);
383                         
384                         vector< vector<quanMember> > quan; 
385                         quan.resize(100);
386                         
387                         //get quantiles
388                         for (int m = 0; m < quan.size(); m++) {
389                                 int num;
390                                 in >> num; 
391                                 
392                                 gobble(in);
393
394                                 vector<quanMember> q;  float w; int b, n;
395                                 for (int j = 0; j < num; j++) {
396                                         in >> w >> b >> n;
397         //cout << w << '\t' << b << '\t' n << endl;
398                                         quanMember newMember(w, b, n);
399                                         q.push_back(newMember);
400                                 }
401 //cout << "here" << endl;
402                                 quan[m] = q;
403 //cout << "now here" << endl;
404                                 gobble(in);
405                         }
406                         
407         
408                         //save quan in quantiles
409                         for (int j = 0; j < quan.size(); j++) {
410                                 //put all values of q[i] into quan[i]
411                                 for (int l = 0; l < quan[j].size(); l++) {  quantilesMembers[j].push_back(quan[j][l]);   }
412                                 //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
413                         }
414                                         
415                         in.close();
416                         remove(s.c_str());
417                 }
418                 
419 #else
420                 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
421 #endif          
422         }
423         catch(exception& e) {
424                 errorOut(e, "Pintail", "createProcessesQuan");
425                 exit(1);
426         }
427 }
428
429
430 //***************************************************************************************************************
431
432