]> git.donarmstrong.com Git - mothur.git/blob - refchimeratest.cpp
changed random forest output filename
[mothur.git] / refchimeratest.cpp
1 /*
2  *  refchimeratest.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 1/31/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "refchimeratest.h"
11 #include "myPerseus.h"
12 #include "mothur.h"
13
14 int MAXINT = numeric_limits<int>::max();
15
16 //***************************************************************************************************************
17
18 RefChimeraTest::RefChimeraTest(vector<Sequence>& refs, bool aligned) : aligned(aligned){
19
20         m = MothurOut::getInstance();
21
22         numRefSeqs = refs.size();
23
24         referenceSeqs.resize(numRefSeqs);
25         referenceNames.resize(numRefSeqs);
26         for(int i=0;i<numRefSeqs;i++){
27                 if (aligned) { referenceSeqs[i] = refs[i].getAligned(); }
28         else { referenceSeqs[i] = refs[i].getUnaligned(); }
29                 referenceNames[i] = refs[i].getName();
30         }
31         
32         alignLength = referenceSeqs[0].length();
33
34 }
35 //***************************************************************************************************************
36
37 int RefChimeraTest::printHeader(ofstream& chimeraReportFile){
38         try {
39                 chimeraReportFile << "queryName\tbestRef\tbestSequenceMismatch\tleftParentChi,rightParentChi\tbreakPointChi\tminMismatchToChimera\tdistToBestMera\tnumParents" << endl;
40                 return 0; 
41         }catch(exception& e) {
42                 m->errorOut(e, "RefChimeraTest", "execute");
43                 exit(1);
44         }
45 }
46
47 //***************************************************************************************************************
48
49 int RefChimeraTest::analyzeQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
50
51     int numParents = -1;
52     
53     if(aligned){
54         numParents = analyzeAlignedQuery(queryName, querySeq, chimeraReportFile);
55     }
56     else{
57         numParents = analyzeUnalignedQuery(queryName, querySeq, chimeraReportFile);
58     }
59     
60     return numParents;
61     
62 }
63  
64 //***************************************************************************************************************
65
66 int RefChimeraTest::analyzeAlignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
67         
68         vector<vector<int> > left(numRefSeqs);
69         vector<int> singleLeft, bestLeft;
70         vector<int> singleRight, bestRight;
71         
72         vector<vector<int> > right(numRefSeqs);
73         for(int i=0;i<numRefSeqs;i++){
74                 left[i].assign(alignLength, 0);
75         }
76         right = left;
77         
78         int bestSequenceMismatch = getAlignedMismatches(querySeq, left, right, bestMatch);
79     
80     
81         
82         int leftParentBi, rightParentBi, breakPointBi;
83         int minMismatchToChimera = getChimera(left, right, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight);
84         
85         int nMera = 0;
86         string chimeraRefSeq = "";
87         
88         if(bestSequenceMismatch - minMismatchToChimera >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){
89                 nMera = 2;
90                 chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi);
91         }
92         else{
93                 nMera = 1;
94                 chimeraRefSeq = referenceSeqs[bestMatch];
95         }
96         
97         bestRefAlignment = chimeraRefSeq;
98     bestQueryAlignment = querySeq;
99     
100         double distToChimera = calcDistToChimera(bestQueryAlignment, bestRefAlignment);
101         
102         chimeraReportFile << queryName << '\t' << referenceNames[bestMatch] << '\t' << bestSequenceMismatch << '\t';
103         chimeraReportFile << referenceNames[leftParentBi] << ',' << referenceNames[rightParentBi] << '\t' << breakPointBi << '\t';
104         chimeraReportFile << minMismatchToChimera << '\t';
105     chimeraReportFile << '\t' << distToChimera << '\t' << nMera << endl;
106                 
107         return nMera;
108 }
109
110 //***************************************************************************************************************
111
112 int RefChimeraTest::analyzeUnalignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
113         
114     int nMera = 0;
115     
116     int seqLength = querySeq.length();
117     
118     vector<string> queryAlign(numRefSeqs);
119     vector<string> refAlign(numRefSeqs);
120
121     vector<vector<int> > leftDiffs(numRefSeqs);
122     vector<vector<int> > rightDiffs(numRefSeqs);
123     vector<vector<int> > leftMaps(numRefSeqs);
124     vector<vector<int> > rightMaps(numRefSeqs);
125     
126     int bestRefIndex = -1;
127     int bestRefDiffs = numeric_limits<int>::max();
128     double bestRefLength = 0;
129     
130     for(int i=0;i<numRefSeqs;i++){
131         double length = 0;
132         double diffs = alignQueryToReferences(querySeq, referenceSeqs[i], queryAlign[i], refAlign[i], length);
133         if(diffs < bestRefDiffs){
134             bestRefDiffs = diffs;
135             bestRefLength = length;
136             bestRefIndex = i;
137         }
138     }
139
140     if(bestRefDiffs >= 3){
141         for(int i=0;i<numRefSeqs;i++){
142             leftDiffs[i].assign(seqLength, 0);
143             rightDiffs[i].assign(seqLength, 0);
144             leftMaps[i].assign(seqLength, 0);
145             rightMaps[i].assign(seqLength, 0);
146             
147             getUnalignedDiffs(queryAlign[i], refAlign[i], leftDiffs[i], leftMaps[i], rightDiffs[i], rightMaps[i]);
148         }
149     
150         vector<int> singleLeft(seqLength, numeric_limits<int>::max());
151         vector<int> bestLeft(seqLength, -1);
152         
153         for(int l=0;l<seqLength;l++){
154             
155             for(int i=0;i<numRefSeqs;i++){            
156                 if(leftDiffs[i][l] < singleLeft[l]){
157                     singleLeft[l] = leftDiffs[i][l];
158                     bestLeft[l] = i;
159                 }
160             }
161         }
162         
163         vector<int> singleRight(seqLength, numeric_limits<int>::max());
164         vector<int> bestRight(seqLength, -1);
165         
166         for(int l=0;l<seqLength;l++){
167                 
168             for(int i=0;i<numRefSeqs;i++){
169                 if(rightDiffs[i][l] < singleRight[l]){
170                     singleRight[l] = rightDiffs[i][l];
171                     bestRight[l] = i;
172                 }
173             }
174         }
175         
176         int bestChimeraMismatches = numeric_limits<int>::max();
177         int leftParent = 0;
178         int rightParent = 0;
179         int breakPoint = 0;
180         
181         for(int l=0;l<seqLength-1;l++){
182             
183             int chimera = singleLeft[l] + singleRight[seqLength - l - 2];
184             if(chimera < bestChimeraMismatches){
185                 bestChimeraMismatches = chimera;
186                 breakPoint = l;
187                 leftParent = bestLeft[l];
188                 rightParent = bestRight[seqLength - l - 2];
189             }
190         }
191         
192         string reference;
193         
194         if(bestRefDiffs - bestChimeraMismatches >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){
195             nMera = 2;
196             
197             int breakLeft = leftMaps[leftParent][breakPoint];
198             int breakRight = rightMaps[rightParent][rightMaps[rightParent].size() - breakPoint - 2];
199             
200             string left = refAlign[leftParent];
201             string right = refAlign[rightParent];
202             
203             for(int i=0;i<=breakLeft;i++){
204                 
205                 if (m->control_pressed) { return 0; }
206                 
207                 if(left[i] != '-' && left[i] != '.'){
208                     reference += left[i];
209                 }
210             }
211             
212             
213             for(int i=breakRight;i<right.length();i++){
214                 
215                 if (m->control_pressed) { return 0; }
216                 
217                 if(right[i] != '-' && right[i] != '.'){
218                     reference += right[i];
219                 }
220             }
221
222         }
223         else{
224             nMera = 1;
225             reference = referenceSeqs[bestRefIndex];
226         }
227
228         double alignLength;
229         double finalDiffs = alignQueryToReferences(querySeq, reference, bestQueryAlignment, bestRefAlignment, alignLength);
230         double finalDistance = finalDiffs / alignLength;
231
232         chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t';
233         chimeraReportFile << referenceNames[leftParent] << ',' << referenceNames[rightParent] << '\t' << breakPoint << '\t';
234         chimeraReportFile << bestChimeraMismatches << '\t';
235         chimeraReportFile << '\t' << finalDistance << '\t' << nMera << endl;
236     }
237     else{
238         bestQueryAlignment = queryAlign[bestRefIndex];
239         bestRefAlignment = refAlign[bestRefIndex];
240         nMera = 1;
241         
242         chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t';
243         chimeraReportFile << "NA\tNA\tNA\tNA\t1" << endl;
244     } 
245     
246     bestMatch = bestRefIndex;
247     return nMera;
248 }
249
250 /**************************************************************************************************/
251
252 double RefChimeraTest::alignQueryToReferences(string query, string reference, string& qAlign, string& rAlign, double& length){
253     
254     
255     try {
256                 double GAP = -5;
257                 double MATCH = 1;
258                 double MISMATCH = -1;
259                 
260                 int queryLength = query.length();
261                 int refLength = reference.length();
262                 
263                 vector<vector<double> > alignMatrix(queryLength + 1);
264                 vector<vector<char> > alignMoves(queryLength + 1);
265                 
266                 for(int i=0;i<=queryLength;i++){
267                         if (m->control_pressed) { return 0; }
268                         alignMatrix[i].resize(refLength + 1, 0);
269                         alignMoves[i].resize(refLength + 1, 'x');
270                 }
271                 
272                 for(int i=0;i<=queryLength;i++){
273                         if (m->control_pressed) { return 0; }
274                         alignMatrix[i][0] = 0;//GAP * i;
275                         alignMoves[i][0] = 'u';
276                 }
277                 
278                 for(int i=0;i<=refLength;i++){
279                         if (m->control_pressed) { return 0; }
280                         alignMatrix[0][i] = 0;//GAP * i;
281                         alignMoves[0][i] = 'l';
282                 }
283                         
284                 for(int i=1;i<=queryLength;i++){
285                         
286                         if (m->control_pressed) { return 0; }
287                         
288                         for(int j=1;j<=refLength;j++){
289                                 
290                                 double nogapScore;              
291                                 if(query[i-1] == reference[j-1]){       nogapScore = alignMatrix[i-1][j-1] + MATCH;             }
292                                 else                                                    {       nogapScore = alignMatrix[i-1][j-1] + MISMATCH;  }
293                                 
294                                 double leftScore;
295                                 if(i == queryLength)                    {       leftScore = alignMatrix[i][j-1];                                }
296                                 else                                                    {       leftScore = alignMatrix[i][j-1] + GAP;                  }
297                                 
298                                 
299                                 double upScore;
300                                 if(j == refLength)                              {       upScore = alignMatrix[i-1][j];                                  }
301                                 else                                                    {       upScore = alignMatrix[i-1][j] + GAP;                    }
302                                 
303                                 if(nogapScore > leftScore){
304                                         if(nogapScore > upScore){
305                                                 alignMoves[i][j] = 'd';
306                                                 alignMatrix[i][j] = nogapScore;
307                                         }
308                                         else{
309                                                 alignMoves[i][j] = 'u';
310                                                 alignMatrix[i][j] = upScore;
311                                         }
312                                 }
313                                 else{
314                                         if(leftScore > upScore){
315                                                 alignMoves[i][j] = 'l';
316                                                 alignMatrix[i][j] = leftScore;
317                                         }
318                                         else{
319                                                 alignMoves[i][j] = 'u';
320                                                 alignMatrix[i][j] = upScore;
321                                         }
322                                 }
323                         }
324                 }
325                         
326                 int end = refLength - 1;
327         int maxRow = 0;
328         double maxRowValue = -2147483647;
329         for(int i=0;i<queryLength;i++){
330             if(alignMatrix[i][end] > maxRowValue){
331                 maxRow = i;
332                 maxRowValue = alignMatrix[i][end];
333             }
334         }
335         
336         end = queryLength - 1;
337         int maxColumn = 0;
338         double maxColumnValue = -2147483647;
339
340         for(int j=0;j<refLength;j++){
341             if(alignMatrix[end][j] > maxColumnValue){
342                 maxColumn = j;
343                 maxColumnValue = alignMatrix[end][j];
344             }
345         }
346
347         int row = queryLength-1;
348         int column = refLength-1;
349         
350         if(maxColumn == column && maxRow == row){}      //      if the max values are the lower right corner, then we're good
351         else if(alignMatrix[row][maxColumn] < alignMatrix[maxRow][column]){
352             for(int i=maxRow+1;i<queryLength;i++){                      //      decide whether sequence A or B needs the gaps at the end either set 
353                 alignMoves[i][column] = 'u';//  the pointer upwards or...
354             }
355             
356         }
357         else {
358             for(int i=maxColumn+1;i<refLength;i++){
359                 alignMoves[row][i] = 'l';       //      ...to the left
360             }
361         }
362
363         int i = queryLength;
364                 int j = refLength;
365
366         
367                 qAlign = "";
368                 rAlign = "";
369         
370                 int diffs = 0;
371                 length = 0;
372
373                 while(i > 0 && j > 0){
374                         
375                         if (m->control_pressed) { return 0; }
376                         
377                         if(alignMoves[i][j] == 'd'){
378                                 qAlign = query[i-1] + qAlign;
379                                 rAlign = reference[j-1] + rAlign;
380                 
381                                 if(query[i-1] != reference[j-1]){       diffs++;        }
382                                 length++;
383                                 
384                                 i--;
385                                 j--;
386                         }
387                         else if(alignMoves[i][j] == 'u'){
388                                 qAlign = query[i-1] + qAlign;
389                                 
390                                 if(j != refLength)      {       rAlign = '-' + rAlign;  diffs++;        length++;       }
391                                 else                            {       rAlign = '.' + rAlign;  }
392                                 i--;
393                         }
394                         else if(alignMoves[i][j] == 'l'){
395                                 rAlign = reference[j-1] + rAlign;
396                                 
397                                 if(i != queryLength){   qAlign = '-' + qAlign;  diffs++;        length++;       }
398                                 else                            {       qAlign = '.' + qAlign;  }
399                                 j--;
400                         }
401                 }
402
403         if(i>0){
404             qAlign = query.substr(0, i) + qAlign;
405             rAlign = string(i, '.') + rAlign;
406         }
407                 else if(j>0){
408             qAlign = string(j, '.') + qAlign;
409             rAlign = reference.substr(0, j) + rAlign;
410         }
411
412         
413                 return diffs;
414         }
415         catch(exception& e) {
416                 m->errorOut(e, "RefChimeraTest", "alignQueryToReferences");
417                 exit(1);
418         }
419 }
420
421 /**************************************************************************************************/
422
423 int RefChimeraTest::getUnalignedDiffs(string qAlign, string rAlign, vector<int>& leftDiffs, vector<int>& leftMap, vector<int>& rightDiffs, vector<int>& rightMap){
424         try {
425                 int alignLength = qAlign.length();
426                 
427                 int lDiffs = 0;
428                 int lCount = 0;
429                 for(int l=0;l<alignLength;l++){
430                         
431                         if (m->control_pressed) { return 0; }
432                         
433                         if(qAlign[l] == '-'){
434                                 lDiffs++;               
435                         }
436                         else if(qAlign[l] != '.'){
437                                 
438                                 if(rAlign[l] == '-'){
439                                         lDiffs++;
440                                 }
441                                 else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){
442                                         lDiffs++;
443                                 }
444                                 leftDiffs[lCount] = lDiffs;
445                                 leftMap[lCount] = l;
446                                 
447                                 lCount++;
448                         }                       
449                 }
450
451                 int rDiffs = 0;
452                 int rCount = 0;
453                 for(int l=alignLength-1;l>=0;l--){
454                         
455                         if (m->control_pressed) { return 0; }
456                         
457                         if(qAlign[l] == '-'){
458                                 rDiffs++;               
459                         }
460                         else if(qAlign[l] != '.'){
461                                 
462                                 if(rAlign[l] == '-'){
463                                         rDiffs++;
464                                 }
465                                 else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){
466                                         rDiffs++;
467                                 }
468                                 
469                                 rightDiffs[rCount] = rDiffs;
470                                 rightMap[rCount] = l;
471                                 rCount++;
472                         }
473                         
474                 }
475
476                 return 0;
477         }
478         catch(exception& e) {
479                 m->errorOut(e, "RefChimeraTest", "getUnalignedDiffs");
480                 exit(1);
481         }
482 }
483
484 /**************************************************************************************************/
485
486 int RefChimeraTest::getAlignedMismatches(string& querySeq, vector<vector<int> >& left, vector<vector<int> >& right, int& bestRefSeq){
487         
488         int bestSequenceMismatch = MAXINT;
489         
490         for(int i=0;i<numRefSeqs;i++){
491                 
492                 int lDiffs = 0;
493                 
494                 for(int l=0;l<alignLength;l++){
495                         if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
496                                 lDiffs++;
497                         }
498                         left[i][l] = lDiffs;
499                 }
500                 
501                 int rDiffs = 0;
502                 int index = 0;
503                 for(int l=alignLength-1;l>=0;l--){
504                         if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
505                                 rDiffs++;
506                         }                       
507                         right[i][index++] = rDiffs;
508                 }
509                 
510                 if(lDiffs < bestSequenceMismatch){
511                         bestSequenceMismatch = lDiffs;
512                         bestRefSeq = i;
513                 }
514         }
515         return bestSequenceMismatch;
516 }
517
518 /**************************************************************************************************/
519
520 int RefChimeraTest::getChimera(vector<vector<int> >& left, vector<vector<int> >& right, int& leftParent, int& rightParent, int& breakPoint, vector<int>& singleLeft, vector<int>& bestLeft, vector<int>& singleRight, vector<int>& bestRight){
521         
522         singleLeft.resize(alignLength, MAXINT);
523         bestLeft.resize(alignLength, -1);
524         
525         for(int l=0;l<alignLength;l++){
526                 for(int i=0;i<numRefSeqs;i++){
527                         if(left[i][l] <= singleLeft[l]){
528                                 singleLeft[l] = left[i][l];
529                                 bestLeft[l] = i;
530                         }
531                 }
532         }
533         
534         singleRight.resize(alignLength, MAXINT);
535         bestRight.resize(alignLength, -1);
536         
537         for(int l=0;l<alignLength;l++){
538                 for(int i=0;i<numRefSeqs;i++){
539                         if(right[i][l] <= singleRight[l]){
540                                 singleRight[l] = right[i][l];
541                                 bestRight[l] = i;
542                         }
543                 }
544         }
545         
546         int bestChimeraMismatches = MAXINT;
547         leftParent = -1;
548         rightParent = -1;
549         breakPoint = -1;
550         
551         for(int l=0;l<alignLength;l++){
552                 int chimera = singleLeft[l] + singleRight[alignLength - l - 1];
553                 if(chimera < bestChimeraMismatches){
554                         bestChimeraMismatches = chimera;
555                         breakPoint = l;
556                         leftParent = bestLeft[l];
557                         rightParent = bestRight[alignLength - l - 1];
558                 }
559         }
560         
561         return bestChimeraMismatches;
562 }
563
564 /**************************************************************************************************/
565
566 int RefChimeraTest::getTrimera(vector<vector<int> >& left, vector<vector<int> >& right, int& leftParent, int& middleParent, int& rightParent, int& breakPointA, int& breakPointB, vector<int>& singleLeft, vector<int>& bestLeft, vector<int>& singleRight, vector<int>& bestRight){
567         
568         int bestTrimeraMismatches = MAXINT;
569         
570         leftParent = -1;
571         middleParent = -1;
572         rightParent = -1;
573         
574         breakPointA = -1;
575         breakPointB = -1;
576         
577         vector<vector<int> > minDelta(alignLength);
578         vector<vector<int> > minDeltaSeq(alignLength);
579         
580         for(int i=0;i<alignLength;i++){
581                 minDelta[i].assign(alignLength, MAXINT);
582                 minDeltaSeq[i].assign(alignLength, -1);
583         }
584         
585         for(int x=0;x<alignLength;x++){
586                 for(int y=x;y<alignLength-1;y++){
587                         
588                         for(int i=0;i<numRefSeqs;i++){
589                                 int delta = left[i][y] - left[i][x];
590                                 
591                                 if(delta <= minDelta[x][y]){
592                                         minDelta[x][y] = delta;
593                                         minDeltaSeq[x][y] = i;                                  
594                                 }                               
595                         }
596                         minDelta[x][y] += singleLeft[x] + singleRight[alignLength - y - 2];
597                         
598                         if(minDelta[x][y] < bestTrimeraMismatches){
599                                 bestTrimeraMismatches = minDelta[x][y];
600                                 
601                                 breakPointA = x;
602                                 breakPointB = y;
603                                 
604                                 leftParent = bestLeft[x];
605                                 middleParent = minDeltaSeq[x][y];
606                                 rightParent = bestRight[alignLength - y - 2];                           
607                         }
608                 }               
609         }
610         return bestTrimeraMismatches;
611 }
612
613 /**************************************************************************************************/
614
615 string RefChimeraTest::stitchBimera(int leftParent, int rightParent, int breakPoint){
616         
617         string chimeraRefSeq = referenceSeqs[leftParent].substr(0, breakPoint) + referenceSeqs[rightParent].substr(breakPoint);
618         return chimeraRefSeq;
619         
620 }
621
622 /**************************************************************************************************/
623
624 string RefChimeraTest::stitchTrimera(int leftParent, int middleParent, int rightParent, int breakPointA, int breakPointB){
625         
626         string chimeraRefSeq = referenceSeqs[leftParent].substr(0, breakPointA) + referenceSeqs[middleParent].substr(breakPointA, breakPointB-breakPointA) + referenceSeqs[rightParent].substr(breakPointB);
627         
628         return chimeraRefSeq;
629 }
630
631 /**************************************************************************************************/
632
633 double RefChimeraTest::calcDistToChimera(string& querySeq, string& chimeraRefSeq){
634         
635         int match = 0;
636         int mismatch = 0;
637         
638         for(int i=0;i<alignLength;i++){
639 //              if(querySeq[i] != '.' && chimeraRefSeq[i] != '.'){
640                 if(chimeraRefSeq[i] != '.' && querySeq[i] != '.'){
641                         if(querySeq[i] == '-' && chimeraRefSeq[i] == '-' && chimeraRefSeq[i] != 'N'){   /*      do nothing      */      }
642                         else if(querySeq[i] == chimeraRefSeq[i]){
643                                 match++;
644                         }
645                         else{
646                                 mismatch++;
647                         }                       
648                 }
649         }
650         
651         return (double)mismatch / (double)(mismatch + match);   
652 }
653
654 //***************************************************************************************************************
655
656 int RefChimeraTest::getClosestRefIndex(){
657
658         return bestMatch;
659         
660 }
661
662 //***************************************************************************************************************
663
664 string RefChimeraTest::getQueryAlignment(){
665     
666         return bestQueryAlignment;
667         
668 }
669
670 //***************************************************************************************************************
671
672 string RefChimeraTest::getClosestRefAlignment(){
673     
674         return bestRefAlignment;
675         
676 }
677
678 //***************************************************************************************************************