]> git.donarmstrong.com Git - mothur.git/blob - alignedsimilarity.cpp
finished with ccode, returned bellerophon to last save before move, cleaned up pintai...
[mothur.git] / alignedsimilarity.cpp
1 /*
2  *  alignedsimilarity.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 8/18/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "alignedsimilarity.h"
11 #include "ignoregaps.h"
12
13
14 //***************************************************************************************************************
15 AlignSim::AlignSim(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
16 //***************************************************************************************************************
17
18 AlignSim::~AlignSim() {
19         try {
20                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
21                 for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
22                 delete distCalc;
23         }
24         catch(exception& e) {
25                 errorOut(e, "AlignSim", "~AlignSim");
26                 exit(1);
27         }
28 }       
29 //***************************************************************************************************************
30 void AlignSim::print(ostream& out) {
31         try {
32                 
33                 mothurOutEndLine();
34                 
35                 for (int i = 0; i < querySeqs.size(); i++) {
36                         
37                         int j = 0;  float largest = -10;
38                         //find largest sim value
39                         for (int k = 0; k < IS[i].size(); k++) {
40                                 //is this score larger
41                                 if (IS[i][k].score > largest) {
42                                         j = k;
43                                         largest = IS[i][k].score;
44                                 }
45                         }
46                         
47                         //find parental similarity
48                         distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent));
49                         float dist = distCalc->getDist();
50                         
51                         //convert to similarity
52                         dist = (1 - dist) * 100;
53
54                         //warn about parental similarity - if its above 82% may not detect a chimera 
55                         if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly.");  mothurOutEndLine(); }
56                         
57                         int index = ceil(dist);
58                         
59                         if (index == 0) { index=1;  }
60         
61                         //is your DE value higher than the 95%
62                         string chimera;
63                         if (IS[i][j].score > quantile[index-1][4])              {       chimera = "Yes";        }
64                         else                                                                            {       chimera = "No";         }                       
65                         
66                         out << querySeqs[i]->getName() <<  "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl;
67                         
68                         if (chimera == "Yes") {
69                                 mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine();
70                         }
71                         out << "Improvement Score\t";
72                         
73                         for (int r = 0; r < IS[i].size(); r++) {  out << IS[i][r].score << '\t';  }
74                         out << endl;
75                 }
76         }
77         catch(exception& e) {
78                 errorOut(e, "AlignSim", "print");
79                 exit(1);
80         }
81 }
82
83 //***************************************************************************************************************
84 void AlignSim::getChimeras() {
85         try {
86                 
87                 //read in query sequences and subject sequences
88                 mothurOut("Reading sequences and template file... "); cout.flush();
89                 querySeqs = readSeqs(fastafile);
90                 templateSeqs = readSeqs(templateFile);
91                 mothurOut("Done."); mothurOutEndLine();
92                 
93                 int numSeqs = querySeqs.size();
94                 
95                 IS.resize(numSeqs);
96                 
97                 //break up file if needed
98                 int linesPerProcess = numSeqs / processors ;
99                 
100                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
101                         //find breakup of sequences for all times we will Parallelize
102                         if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
103                         else {
104                                 //fill line pairs
105                                 for (int i = 0; i < (processors-1); i++) {                      
106                                         lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
107                                 }
108                                 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
109                                 int i = processors - 1;
110                                 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
111                         }
112                         
113                         //find breakup of templatefile for quantiles
114                         if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
115                         else { 
116                                 for (int i = 0; i < processors; i++) {
117                                         templateLines.push_back(new linePair());
118                                         templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
119                                         templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
120                                 }
121                         }
122                 #else
123                         lines.push_back(new linePair(0, numSeqs));
124                         templateLines.push_back(new linePair(0, templateSeqs.size()));
125                 #endif
126         
127         
128                 distCalc = new ignoreGaps();
129                 
130                 //find window breaks
131                 windowBreak = findWindows();
132 //for (int i = 0; i < windowBreak.size(); i++) { cout << windowBreak[i] << '\t';  }
133 //cout << endl;
134
135                 mothurOut("Finding the IS values for your sequences..."); cout.flush();
136                 if (processors == 1) { 
137                         IS = findIS(lines[0]->start, lines[0]->end, querySeqs);
138                 }else {         IS = createProcessesIS(querySeqs, lines);               }
139                 mothurOut("Done."); mothurOutEndLine();
140                 
141                 //quantiles are used to determine whether the de values found indicate a chimera
142                 if (quanfile != "") {  quantile = readQuantiles();  }
143                 else {
144                         
145                         mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .alignedsimilarity.quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
146                         //get IS quantiles as a reference to these IS scores
147                         //you are assuming the template is free of chimeras and therefore will give you a baseline as to the scores you would like to see
148                         if (processors == 1) { 
149                                 templateIS = findIS(templateLines[0]->start, templateLines[0]->end, templateSeqs);
150                         }else {         templateIS = createProcessesIS(templateSeqs, templateLines);            }
151 //cout << "here" << endl;                       
152                         ofstream out4;
153                         string o;
154                         
155                         o = getRootName(templateFile) + "alignedsimilarity.quan";
156                         
157                         openOutputFile(o, out4);
158                         
159                         //adjust quantiles
160                         //construct table
161                         quantile.resize(100);
162                 
163                         for (int i = 0; i < templateIS.size(); i++) {
164                                 
165                                 if (templateIS[i].size() != 0) {
166                                         int j = 0; float score = -1000;
167                                         //find highest IS score
168         //cout << templateIS[i].size() << endl;
169                                         for (int k = 0; k < templateIS[i].size(); k++) {
170                                                 if (templateIS[i][k].score > score) {
171                                                         score = templateIS[i][k].score;
172                                                         j = k;
173                                                 }
174                                         }
175         //cout << j << endl;            
176                                         //find similarity of parents
177                                         distCalc->calcDist(*(templateIS[i][j].leftParent), *(templateIS[i][j].rightParent));
178                                         float dist = distCalc->getDist();
179                         
180                                         //convert to similarity
181                                         dist = (1 - dist) * 100;
182         //      cout << dist << endl;   
183                                         int index = ceil(dist);
184         //      cout << "index = " << index << endl;    
185                                         if (index == 0) {       index = 1;      }
186                                         quantile[index-1].push_back(templateIS[i][j].score);
187                                 }
188                         }
189                 
190                 
191                         for (int i = 0; i < quantile.size(); i++) {
192                                 vector<float> temp;
193                                 
194                                 if (quantile[i].size() == 0) {
195                                         //in case this is not a distance found in your template files
196                                         for (int g = 0; g < 6; g++) {
197                                                 temp.push_back(0.0);
198                                         }
199                                 }else{
200                                         
201                                         sort(quantile[i].begin(), quantile[i].end());
202                                         
203                                         //save 10%
204                                         temp.push_back(quantile[i][int(quantile[i].size() * 0.10)]);
205                                         //save 25%
206                                         temp.push_back(quantile[i][int(quantile[i].size() * 0.25)]);
207                                         //save 50%
208                                         temp.push_back(quantile[i][int(quantile[i].size() * 0.5)]);
209                                         //save 75%
210                                         temp.push_back(quantile[i][int(quantile[i].size() * 0.75)]);
211                                         //save 95%
212                                         temp.push_back(quantile[i][int(quantile[i].size() * 0.95)]);
213                                         //save 99%
214                                         temp.push_back(quantile[i][int(quantile[i].size() * 0.99)]);
215                                         
216                                 }
217                                 
218                                 //output quan value
219                                 out4 << i+1 << '\t';                            
220                                 for (int u = 0; u < temp.size(); u++) {   out4 << temp[u] << '\t'; }
221                                 out4 << endl;
222                                 
223                                 quantile[i] = temp;
224                                 
225                         }
226                         
227                         out4.close();
228                         
229                         mothurOut("Done."); mothurOutEndLine();
230
231                 }
232                 
233                 //free memory
234                 for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
235                 for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
236                         
237         }
238         catch(exception& e) {
239                 errorOut(e, "AlignSim", "getChimeras");
240                 exit(1);
241         }
242 }
243 //***************************************************************************************************************
244 vector<int> AlignSim::findWindows() {
245         try {
246                 
247                 vector<int> win; 
248                 
249                 if (increment > querySeqs[0]->getAligned().length()) {  mothurOut("You have selected an increment larger than the length of your sequences.  I will use the default of 25.");  increment = 25; }
250                 
251                 for (int m = increment;  m < (querySeqs[0]->getAligned().length() - increment); m+=increment) {  win.push_back(m);  }
252
253                 return win;
254         
255         }
256         catch(exception& e) {
257                 errorOut(e, "AlignSim", "findWindows");
258                 exit(1);
259         }
260 }
261
262 //***************************************************************************************************************
263 vector< vector<sim>  > AlignSim::findIS(int start, int end, vector<Sequence*> seqs) {
264         try {
265                 
266                 vector< vector<sim> >  isValues;
267                 isValues.resize(seqs.size());
268                 
269                 //for each sequence
270                 for(int i = start; i < end; i++){
271                         
272                         vector<sim> temp;  temp.resize(windowBreak.size());
273                         
274                         mothurOut("Finding IS value for sequence " + toString(i)); mothurOutEndLine();
275                         
276                         //for each window
277                         for (int m = 0; m < windowBreak.size(); m++) {
278                                 
279                                 vector<Sequence*> closest;  //left, right, overall
280                                 vector<int> similarity; //left+right and overall
281                                 
282                                 //find closest left, closest right and closest overall
283                                 closest = findClosestSides(seqs[i], windowBreak[m], similarity, i);
284                                 
285                                 int totalNumBases = seqs[i]->getUnaligned().length();
286                                 
287                                 //IS = left+right-overall
288                                 float is = ((similarity[0]+similarity[1]) / (float) totalNumBases) - (similarity[2] / (float) totalNumBases);
289                                 
290                                 ///save IS, leftparent, rightparent, breakpoint
291                                 temp[m].leftParent = closest[0];
292                                 temp[m].rightParent = closest[1];
293                                 temp[m].score = is;
294                                 temp[m].midpoint = windowBreak[m];
295 //cout << is << '\t';
296                         }
297 //cout << endl;                 
298                         isValues[i] = temp;
299                 
300                 }
301                 
302                 return isValues;
303         
304         }
305         catch(exception& e) {
306                 errorOut(e, "AlignSim", "findIS");
307                 exit(1);
308         }
309 }
310 //***************************************************************************************************************
311 vector<Sequence*> AlignSim::findClosestSides(Sequence* seq, int breakpoint, vector<int>& sim, int i) {
312         try{
313         
314                 vector<Sequence*> closest;
315                 
316                 Sequence query, queryLeft, queryRight;
317                 string frag = seq->getAligned();
318                 string fragLeft = frag.substr(0, breakpoint);
319                 string fragRight = frag.substr(breakpoint, frag.length());
320                 
321                 //get pieces
322                 query = *(seq);  
323                 queryLeft = *(seq);  
324                 queryRight = *(seq);
325                 queryLeft.setAligned(fragLeft);
326                 queryRight.setAligned(fragRight);
327                 
328                 //initialize
329                 Sequence* overall = templateSeqs[0];
330                 Sequence* left = templateSeqs[0];
331                 Sequence* right = templateSeqs[0];
332                 
333                 float smallestOverall, smallestLeft, smallestRight;
334                 smallestOverall = 1000;  smallestLeft = 1000;  smallestRight = 1000;
335                 
336                 //go through the templateSeqs and search for the closest
337                 for(int j = 0; j < templateSeqs.size(); j++){
338                         
339                         //so you don't pick yourself
340                         if (j != i) {
341                                 Sequence temp, tempLeft, tempRight;
342                                 string fragTemp = templateSeqs[j]->getAligned();
343                                 string fragTempLeft = fragTemp.substr(0, breakpoint);
344                                 string fragTempRight = fragTemp.substr(breakpoint, fragTemp.length());
345                                 
346                                 //get pieces
347                                 temp = *(templateSeqs[j]); 
348                                 tempLeft = *(templateSeqs[j]); 
349                                 tempRight = *(templateSeqs[j]);
350                                 tempLeft.setAligned(fragTempLeft);
351                                 tempRight.setAligned(fragTempRight);
352                                 
353                                 //find overall dist
354                                 distCalc->calcDist(query, temp);
355                                 float dist = distCalc->getDist();       
356                                 
357                                 if (dist < smallestOverall) { 
358                                         overall = templateSeqs[j];
359                                         smallestOverall = dist;
360                                 }
361                                 
362                                 //find left dist
363                                 distCalc->calcDist(queryLeft, tempLeft);
364                                 dist = distCalc->getDist();
365                                 
366                                 if (dist < smallestLeft) { 
367                                         left = templateSeqs[j];
368                                         smallestLeft = dist;
369                                 }
370                                 
371                                 //find left dist
372                                 distCalc->calcDist(queryRight, tempRight);
373                                 dist = distCalc->getDist();
374                                 
375                                 if (dist < smallestRight) { 
376                                         right = templateSeqs[j];
377                                         smallestRight = dist;
378                                 }
379                         }
380
381                 }
382                         
383                 closest.push_back(left);
384                 closest.push_back(right);
385                 closest.push_back(overall);
386                 
387                 //fill sim with number of matched bases
388                 Sequence tempL = *(left);
389                 string tempLFrag = tempL.getAligned();
390                 tempL.setAligned(tempLFrag.substr(0, breakpoint));
391                 
392                 int bothMatches = findNumMatchedBases(queryLeft, tempL);
393                 sim.push_back(bothMatches);
394
395                 Sequence tempR = *(right);
396                 string tempRFrag = tempR.getAligned();
397                 tempR.setAligned(tempRFrag.substr(breakpoint, tempRFrag.length()));
398                 
399                 bothMatches = findNumMatchedBases(queryRight, tempR);
400                 sim.push_back(bothMatches);
401
402                 bothMatches = findNumMatchedBases(query, *(overall));
403                 sim.push_back(bothMatches);
404                         
405                 return closest;
406
407
408         }
409         catch(exception& e) {
410                 errorOut(e, "AlignSim", "findClosestSides");
411                 exit(1);
412         }
413 }
414 //***************************************************************************************************************
415
416 int AlignSim::findNumMatchedBases(Sequence seq, Sequence temp) {
417         try{
418         
419                 int num = 0;
420                 
421                 string query = seq.getAligned();
422                 string subject = temp.getAligned();
423 //cout << seq.getName() << endl << endl;
424 //cout << temp.getName()  << endl << endl;
425                 
426                 for (int i = 0; i < query.length(); i++) {
427                         //count matches if query[i] is a base and subject is same bases or gap
428                         if (isalpha(query[i])) {
429                                 if(!isalpha(subject[i])) {  num++;  } 
430                                 else if (query[i] == subject[i])  { num++;  }
431                                 else {  }
432                         }
433                 }
434 //cout << "num = " << num << endl;              
435                 return num;
436         }
437         catch(exception& e) {
438                 errorOut(e, "AlignSim", "findNumMatchedBases");
439                 exit(1);
440         }
441 }
442
443 /**************************************************************************************************/
444 vector< vector<sim> > AlignSim::createProcessesIS(vector<Sequence*> seqs, vector<linePair*> linesToProcess) {
445         try {
446         vector< vector<sim> > localIs; localIs.resize(seqs.size());
447         
448 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
449                 int process = 0;
450                 vector<int> processIDS;
451                 
452                 //loop through and create all the processes you want
453                 while (process != processors) {
454                         int pid = fork();
455                         
456                         if (pid > 0) {
457                                 processIDS.push_back(pid);  
458                                 process++;
459                         }else if (pid == 0){
460                                 
461                                 mothurOut("Finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
462                                 localIs = findIS(linesToProcess[process]->start, linesToProcess[process]->end, seqs);
463                                 mothurOut("Done finding IS values for sequences " +  toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
464                                 
465                                 //write out data to file so parent can read it
466                                 ofstream out;
467                                 string s = toString(getpid()) + ".temp";
468                                 openOutputFile(s, out);
469                                 
470                                 //output pairs
471                                 for (int i = linesToProcess[process]->start; i < linesToProcess[process]->end; i++) {
472                                          out << localIs[i].size() << endl;
473                                          for (int j = 0; j < localIs[i].size(); j++) {
474                                                 localIs[i][j].leftParent->printSequence(out);
475                                                 localIs[i][j].rightParent->printSequence(out);
476                                                 out << ">" << '\t' << localIs[i][j].score << '\t' << localIs[i][j].midpoint << endl;
477                                          }
478                                 }
479                                 out.close();
480                                 
481                                 exit(0);
482                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
483                 }
484                 
485                 //force parent to wait until all the processes are done
486                 for (int i=0;i<processors;i++) { 
487                         int temp = processIDS[i];
488                         wait(&temp);
489                 }
490                 
491                 //get data created by processes
492                 for (int i=0;i<processors;i++) { 
493                         ifstream in;
494                         string s = toString(processIDS[i]) + ".temp";
495                         openInputFile(s, in);
496                         
497                         //get pairs
498                         for (int k = linesToProcess[i]->start; k < linesToProcess[i]->end; k++) {
499                                 int size;
500                                 in >> size;
501                                 gobble(in);
502                                 
503                                 vector<sim> tempVector;
504                                 
505                                 for (int j = 0; j < size; j++) {
506                                 
507                                         sim temp;
508                                         
509                                         temp.leftParent = new Sequence(in);
510                                         gobble(in);
511         //temp.leftParent->printSequence(cout);                         
512                                         temp.rightParent = new Sequence(in);
513                                         gobble(in);
514 //temp.rightParent->printSequence(cout);
515                                         string throwaway;
516                                         in >> throwaway >> temp.score >> temp.midpoint;
517         //cout << temp.score << '\t' << temp.midpoint << endl;                          
518                                         gobble(in);
519                                         
520                                         tempVector.push_back(temp);
521                                 }
522                                 
523                                 localIs[k] = tempVector;
524                         }
525                         
526                         in.close();
527                         remove(s.c_str());
528                 }
529                         
530         
531 #else
532                 localIs = findIS(linesToProcess[0]->start, linesToProcess[0]->end, seqs);
533 #endif  
534
535                 return localIs;
536         }
537         catch(exception& e) {
538                 errorOut(e, "AlignSim", "createProcessesIS");
539                 exit(1);
540         }
541 }
542
543 //***************************************************************************************************************