]> git.donarmstrong.com Git - mothur.git/blob - refchimeratest.cpp
test
[mothur.git] / refchimeratest.cpp
1 /*
2  *  refchimeratest.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 1/31/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "refchimeratest.h"
11 #include "myPerseus.h"
12 #include "mothur.h"
13
14 int MAXINT = numeric_limits<int>::max();
15
16 //***************************************************************************************************************
17
18 RefChimeraTest::RefChimeraTest(vector<Sequence>& refs, bool aligned) : aligned(aligned){
19
20         m = MothurOut::getInstance();
21
22         numRefSeqs = refs.size();
23
24         referenceSeqs.resize(numRefSeqs);
25         referenceNames.resize(numRefSeqs);
26         for(int i=0;i<numRefSeqs;i++){
27                 referenceSeqs[i] = refs[i].getAligned();
28                 referenceNames[i] = refs[i].getName();
29         }
30         
31         alignLength = referenceSeqs[0].length();
32
33 }
34 //***************************************************************************************************************
35
36 int RefChimeraTest::printHeader(ofstream& chimeraReportFile){
37         try {
38                 chimeraReportFile << "queryName\tbestRef\tbestSequenceMismatch\tleftParentChi,rightParentChi\tbreakPointChi\tminMismatchToChimera\tdistToBestMera\tnumParents" << endl;
39                 return 0; 
40         }catch(exception& e) {
41                 m->errorOut(e, "RefChimeraTest", "execute");
42                 exit(1);
43         }
44 }
45
46 //***************************************************************************************************************
47
48 int RefChimeraTest::analyzeQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
49
50     int numParents = -1;
51     
52     if(aligned){
53         numParents = analyzeAlignedQuery(queryName, querySeq, chimeraReportFile);
54     }
55     else{
56         numParents = analyzeUnalignedQuery(queryName, querySeq, chimeraReportFile);
57     }
58     
59     return numParents;
60     
61 }
62  
63 //***************************************************************************************************************
64
65 int RefChimeraTest::analyzeAlignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
66         
67         vector<vector<int> > left(numRefSeqs);
68         vector<int> singleLeft, bestLeft;
69         vector<int> singleRight, bestRight;
70         
71         vector<vector<int> > right(numRefSeqs);
72         for(int i=0;i<numRefSeqs;i++){
73                 left[i].assign(alignLength, 0);
74         }
75         right = left;
76         
77         int bestSequenceMismatch = getAlignedMismatches(querySeq, left, right, bestMatch);
78     
79     
80         
81         int leftParentBi, rightParentBi, breakPointBi;
82         int minMismatchToChimera = getChimera(left, right, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight);
83         
84         int nMera = 0;
85         string chimeraRefSeq = "";
86         
87         if(bestSequenceMismatch - minMismatchToChimera >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){
88                 nMera = 2;
89                 chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi);
90         }
91         else{
92                 nMera = 1;
93                 chimeraRefSeq = referenceSeqs[bestMatch];
94         }
95         
96         bestRefAlignment = chimeraRefSeq;
97     bestQueryAlignment = querySeq;
98     
99         double distToChimera = calcDistToChimera(bestQueryAlignment, bestRefAlignment);
100         
101         chimeraReportFile << queryName << '\t' << referenceNames[bestMatch] << '\t' << bestSequenceMismatch << '\t';
102         chimeraReportFile << referenceNames[leftParentBi] << ',' << referenceNames[rightParentBi] << '\t' << breakPointBi << '\t';
103         chimeraReportFile << minMismatchToChimera << '\t';
104     chimeraReportFile << '\t' << distToChimera << '\t' << nMera << endl;
105                 
106         return nMera;
107 }
108
109 //***************************************************************************************************************
110
111 int RefChimeraTest::analyzeUnalignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
112         
113     int nMera = 0;
114     
115     int seqLength = querySeq.length();
116     
117     vector<string> queryAlign(numRefSeqs);
118     vector<string> refAlign(numRefSeqs);
119
120     vector<vector<int> > leftDiffs(numRefSeqs, 0);
121     vector<vector<int> > rightDiffs(numRefSeqs, 0);
122     vector<vector<int> > leftMaps(numRefSeqs, 0);
123     vector<vector<int> > rightMaps(numRefSeqs, 0);
124     
125     int bestRefIndex = -1;
126     int bestRefDiffs = numeric_limits<int>::max();
127     double bestRefLength = 0;
128     
129     for(int i=0;i<numRefSeqs;i++){
130         double length = 0;
131         int diffs = alignQueryToReferences(querySeq, referenceSeqs[i], queryAlign[i], refAlign[i], length);
132         if(diffs < bestRefDiffs){
133             bestRefDiffs = diffs;
134             bestRefLength = length;
135             bestRefIndex = i;
136         }
137     }
138
139     if(bestRefDiffs >= 3){
140         for(int i=0;i<numRefSeqs;i++){
141             leftDiffs[i].assign(seqLength, 0);
142             rightDiffs[i].assign(seqLength, 0);
143             leftMaps[i].assign(seqLength, 0);
144             rightMaps[i].assign(seqLength, 0);
145             
146             getUnalignedDiffs(queryAlign[i], refAlign[i], leftDiffs[i], leftMaps[i], rightDiffs[i], rightMaps[i]);
147         }
148     
149         vector<int> singleLeft(seqLength, numeric_limits<int>::max());
150         vector<int> bestLeft(seqLength, -1);
151         
152         for(int l=0;l<seqLength;l++){
153             
154             for(int i=0;i<numRefSeqs;i++){            
155                 if(leftDiffs[i][l] < singleLeft[l]){
156                     singleLeft[l] = leftDiffs[i][l];
157                     bestLeft[l] = i;
158                 }
159             }
160         }
161         
162         vector<int> singleRight(seqLength, numeric_limits<int>::max());
163         vector<int> bestRight(seqLength, -1);
164         
165         for(int l=0;l<seqLength;l++){
166                 
167             for(int i=0;i<numRefSeqs;i++){
168                 if(rightDiffs[i][l] < singleRight[l]){
169                     singleRight[l] = rightDiffs[i][l];
170                     bestRight[l] = i;
171                 }
172             }
173         }
174         
175         int bestChimeraMismatches = numeric_limits<int>::max();
176         int leftParent = 0;
177         int rightParent = 0;
178         int breakPoint = 0;
179         
180         for(int l=0;l<seqLength-1;l++){
181             
182             int chimera = singleLeft[l] + singleRight[seqLength - l - 2];
183             if(chimera < bestChimeraMismatches){
184                 bestChimeraMismatches = chimera;
185                 breakPoint = l;
186                 leftParent = bestLeft[l];
187                 rightParent = bestRight[seqLength - l - 2];
188             }
189         }
190         
191         string reference;
192         
193         if(bestRefDiffs - bestChimeraMismatches >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){
194             nMera = 2;
195             
196             int breakLeft = leftMaps[leftParent][breakPoint];
197             int breakRight = rightMaps[rightParent][rightMaps[rightParent].size() - breakPoint - 2];
198             
199             string left = refAlign[leftParent];
200             string right = refAlign[rightParent];
201             
202             for(int i=0;i<=breakLeft;i++){
203                 
204                 if (m->control_pressed) { return 0; }
205                 
206                 if(left[i] != '-' && left[i] != '.'){
207                     reference += left[i];
208                 }
209             }
210             
211             
212             for(int i=breakRight;i<right.length();i++){
213                 
214                 if (m->control_pressed) { return 0; }
215                 
216                 if(right[i] != '-' && right[i] != '.'){
217                     reference += right[i];
218                 }
219             }
220
221         }
222         else{
223             nMera = 1;
224             reference = referenceSeqs[bestRefIndex];
225         }
226
227         double alignLength;
228         double finalDiffs = alignQueryToReferences(querySeq, reference, bestQueryAlignment, bestRefAlignment, alignLength);
229         double finalDistance = finalDiffs / alignLength;
230
231         chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t';
232         chimeraReportFile << referenceNames[leftParent] << ',' << referenceNames[rightParent] << '\t' << breakPoint << '\t';
233         chimeraReportFile << bestChimeraMismatches << '\t';
234         chimeraReportFile << '\t' << finalDistance << '\t' << nMera << endl;
235     }
236     else{
237         bestQueryAlignment = queryAlign[bestRefIndex];
238         bestRefAlignment = refAlign[bestRefIndex];
239         nMera = 1;
240         
241         chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t';
242         chimeraReportFile << "NA\tNA\tNA\tNA\t1" << endl;
243     } 
244     
245     bestMatch = bestRefIndex;
246     return nMera;
247 }
248
249 /**************************************************************************************************/
250
251 double RefChimeraTest::alignQueryToReferences(string query, string reference, string& qAlign, string& rAlign, double& length){
252     
253     
254     try {
255                 double GAP = -5;
256                 double MATCH = 1;
257                 double MISMATCH = -1;
258                 
259                 int queryLength = query.length();
260                 int refLength = reference.length();
261                 
262                 vector<vector<double> > alignMatrix(queryLength + 1);
263                 vector<vector<char> > alignMoves(queryLength + 1);
264                 
265                 for(int i=0;i<=queryLength;i++){
266                         if (m->control_pressed) { return 0; }
267                         alignMatrix[i].resize(refLength + 1, 0);
268                         alignMoves[i].resize(refLength + 1, 'x');
269                 }
270                 
271                 for(int i=0;i<=queryLength;i++){
272                         if (m->control_pressed) { return 0; }
273                         alignMatrix[i][0] = 0;//GAP * i;
274                         alignMoves[i][0] = 'u';
275                 }
276                 
277                 for(int i=0;i<=refLength;i++){
278                         if (m->control_pressed) { return 0; }
279                         alignMatrix[0][i] = 0;//GAP * i;
280                         alignMoves[0][i] = 'l';
281                 }
282                         
283                 for(int i=1;i<=queryLength;i++){
284                         
285                         if (m->control_pressed) { return 0; }
286                         
287                         for(int j=1;j<=refLength;j++){
288                                 
289                                 double nogapScore;              
290                                 if(query[i-1] == reference[j-1]){       nogapScore = alignMatrix[i-1][j-1] + MATCH;             }
291                                 else                                                    {       nogapScore = alignMatrix[i-1][j-1] + MISMATCH;  }
292                                 
293                                 double leftScore;
294                                 if(i == queryLength)                    {       leftScore = alignMatrix[i][j-1];                                }
295                                 else                                                    {       leftScore = alignMatrix[i][j-1] + GAP;                  }
296                                 
297                                 
298                                 double upScore;
299                                 if(j == refLength)                              {       upScore = alignMatrix[i-1][j];                                  }
300                                 else                                                    {       upScore = alignMatrix[i-1][j] + GAP;                    }
301                                 
302                                 if(nogapScore > leftScore){
303                                         if(nogapScore > upScore){
304                                                 alignMoves[i][j] = 'd';
305                                                 alignMatrix[i][j] = nogapScore;
306                                         }
307                                         else{
308                                                 alignMoves[i][j] = 'u';
309                                                 alignMatrix[i][j] = upScore;
310                                         }
311                                 }
312                                 else{
313                                         if(leftScore > upScore){
314                                                 alignMoves[i][j] = 'l';
315                                                 alignMatrix[i][j] = leftScore;
316                                         }
317                                         else{
318                                                 alignMoves[i][j] = 'u';
319                                                 alignMatrix[i][j] = upScore;
320                                         }
321                                 }
322                         }
323                 }
324                         
325                 int end = refLength - 1;
326         int maxRow = 0;
327         double maxRowValue = -100000000000;
328         for(int i=0;i<queryLength;i++){
329             if(alignMatrix[i][end] > maxRowValue){
330                 maxRow = i;
331                 maxRowValue = alignMatrix[i][end];
332             }
333         }
334         
335         end = queryLength - 1;
336         int maxColumn = 0;
337         double maxColumnValue = -100000000000;
338
339         for(int j=0;j<refLength;j++){
340             if(alignMatrix[end][j] > maxColumnValue){
341                 maxColumn = j;
342                 maxColumnValue = alignMatrix[end][j];
343             }
344         }
345
346         int row = queryLength-1;
347         int column = refLength-1;
348         
349         if(maxColumn == column && maxRow == row){}      //      if the max values are the lower right corner, then we're good
350         else if(alignMatrix[row][maxColumn] < alignMatrix[maxRow][column]){
351             for(int i=maxRow+1;i<queryLength;i++){                      //      decide whether sequence A or B needs the gaps at the end either set 
352                 alignMoves[i][column] = 'u';//  the pointer upwards or...
353             }
354             
355         }
356         else {
357             for(int i=maxColumn+1;i<refLength;i++){
358                 alignMoves[row][i] = 'l';       //      ...to the left
359             }
360         }
361
362         int i = queryLength;
363                 int j = refLength;
364
365         
366                 qAlign = "";
367                 rAlign = "";
368         
369                 int diffs = 0;
370                 length = 0;
371
372                 while(i > 0 && j > 0){
373                         
374                         if (m->control_pressed) { return 0; }
375                         
376                         if(alignMoves[i][j] == 'd'){
377                                 qAlign = query[i-1] + qAlign;
378                                 rAlign = reference[j-1] + rAlign;
379                 
380                                 if(query[i-1] != reference[j-1]){       diffs++;        }
381                                 length++;
382                                 
383                                 i--;
384                                 j--;
385                         }
386                         else if(alignMoves[i][j] == 'u'){
387                                 qAlign = query[i-1] + qAlign;
388                                 
389                                 if(j != refLength)      {       rAlign = '-' + rAlign;  diffs++;        length++;       }
390                                 else                            {       rAlign = '.' + rAlign;  }
391                                 i--;
392                         }
393                         else if(alignMoves[i][j] == 'l'){
394                                 rAlign = reference[j-1] + rAlign;
395                                 
396                                 if(i != queryLength){   qAlign = '-' + qAlign;  diffs++;        length++;       }
397                                 else                            {       qAlign = '.' + qAlign;  }
398                                 j--;
399                         }
400                 }
401
402         if(i>0){
403             qAlign = query.substr(0, i) + qAlign;
404             rAlign = string(i, '.') + rAlign;
405         }
406                 else if(j>0){
407             qAlign = string(j, '.') + qAlign;
408             rAlign = reference.substr(0, j) + rAlign;
409         }
410
411         
412                 return diffs;
413         }
414         catch(exception& e) {
415                 m->errorOut(e, "RefChimeraTest", "alignQueryToReferences");
416                 exit(1);
417         }
418 }
419
420 /**************************************************************************************************/
421
422 int RefChimeraTest::getUnalignedDiffs(string qAlign, string rAlign, vector<int>& leftDiffs, vector<int>& leftMap, vector<int>& rightDiffs, vector<int>& rightMap){
423         try {
424                 int alignLength = qAlign.length();
425                 
426                 int lDiffs = 0;
427                 int lCount = 0;
428                 for(int l=0;l<alignLength;l++){
429                         
430                         if (m->control_pressed) { return 0; }
431                         
432                         if(qAlign[l] == '-'){
433                                 lDiffs++;               
434                         }
435                         else if(qAlign[l] != '.'){
436                                 
437                                 if(rAlign[l] == '-'){
438                                         lDiffs++;
439                                 }
440                                 else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){
441                                         lDiffs++;
442                                 }
443                                 leftDiffs[lCount] = lDiffs;
444                                 leftMap[lCount] = l;
445                                 
446                                 lCount++;
447                         }                       
448                 }
449
450                 int rDiffs = 0;
451                 int rCount = 0;
452                 for(int l=alignLength-1;l>=0;l--){
453                         
454                         if (m->control_pressed) { return 0; }
455                         
456                         if(qAlign[l] == '-'){
457                                 rDiffs++;               
458                         }
459                         else if(qAlign[l] != '.'){
460                                 
461                                 if(rAlign[l] == '-'){
462                                         rDiffs++;
463                                 }
464                                 else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){
465                                         rDiffs++;
466                                 }
467                                 
468                                 rightDiffs[rCount] = rDiffs;
469                                 rightMap[rCount] = l;
470                                 rCount++;
471                         }
472                         
473                 }
474
475                 return 0;
476         }
477         catch(exception& e) {
478                 m->errorOut(e, "RefChimeraTest", "getUnalignedDiffs");
479                 exit(1);
480         }
481 }
482
483 /**************************************************************************************************/
484
485 int RefChimeraTest::getAlignedMismatches(string& querySeq, vector<vector<int> >& left, vector<vector<int> >& right, int& bestRefSeq){
486         
487         int bestSequenceMismatch = MAXINT;
488         
489         for(int i=0;i<numRefSeqs;i++){
490                 
491                 int lDiffs = 0;
492                 
493                 for(int l=0;l<alignLength;l++){
494                         if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
495                                 lDiffs++;
496                         }
497                         left[i][l] = lDiffs;
498                 }
499                 
500                 int rDiffs = 0;
501                 int index = 0;
502                 for(int l=alignLength-1;l>=0;l--){
503                         if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
504                                 rDiffs++;
505                         }                       
506                         right[i][index++] = rDiffs;
507                 }
508                 
509                 if(lDiffs < bestSequenceMismatch){
510                         bestSequenceMismatch = lDiffs;
511                         bestRefSeq = i;
512                 }
513         }
514         return bestSequenceMismatch;
515 }
516
517 /**************************************************************************************************/
518
519 int RefChimeraTest::getChimera(vector<vector<int> >& left, vector<vector<int> >& right, int& leftParent, int& rightParent, int& breakPoint, vector<int>& singleLeft, vector<int>& bestLeft, vector<int>& singleRight, vector<int>& bestRight){
520         
521         singleLeft.resize(alignLength, MAXINT);
522         bestLeft.resize(alignLength, -1);
523         
524         for(int l=0;l<alignLength;l++){
525                 for(int i=0;i<numRefSeqs;i++){
526                         if(left[i][l] <= singleLeft[l]){
527                                 singleLeft[l] = left[i][l];
528                                 bestLeft[l] = i;
529                         }
530                 }
531         }
532         
533         singleRight.resize(alignLength, MAXINT);
534         bestRight.resize(alignLength, -1);
535         
536         for(int l=0;l<alignLength;l++){
537                 for(int i=0;i<numRefSeqs;i++){
538                         if(right[i][l] <= singleRight[l]){
539                                 singleRight[l] = right[i][l];
540                                 bestRight[l] = i;
541                         }
542                 }
543         }
544         
545         int bestChimeraMismatches = MAXINT;
546         leftParent = -1;
547         rightParent = -1;
548         breakPoint = -1;
549         
550         for(int l=0;l<alignLength;l++){
551                 int chimera = singleLeft[l] + singleRight[alignLength - l - 1];
552                 if(chimera < bestChimeraMismatches){
553                         bestChimeraMismatches = chimera;
554                         breakPoint = l;
555                         leftParent = bestLeft[l];
556                         rightParent = bestRight[alignLength - l - 1];
557                 }
558         }
559         
560         return bestChimeraMismatches;
561 }
562
563 /**************************************************************************************************/
564
565 int RefChimeraTest::getTrimera(vector<vector<int> >& left, vector<vector<int> >& right, int& leftParent, int& middleParent, int& rightParent, int& breakPointA, int& breakPointB, vector<int>& singleLeft, vector<int>& bestLeft, vector<int>& singleRight, vector<int>& bestRight){
566         
567         int bestTrimeraMismatches = MAXINT;
568         
569         leftParent = -1;
570         middleParent = -1;
571         rightParent = -1;
572         
573         breakPointA = -1;
574         breakPointB = -1;
575         
576         vector<vector<int> > minDelta(alignLength);
577         vector<vector<int> > minDeltaSeq(alignLength);
578         
579         for(int i=0;i<alignLength;i++){
580                 minDelta[i].assign(alignLength, MAXINT);
581                 minDeltaSeq[i].assign(alignLength, -1);
582         }
583         
584         for(int x=0;x<alignLength;x++){
585                 for(int y=x;y<alignLength-1;y++){
586                         
587                         for(int i=0;i<numRefSeqs;i++){
588                                 int delta = left[i][y] - left[i][x];
589                                 
590                                 if(delta <= minDelta[x][y]){
591                                         minDelta[x][y] = delta;
592                                         minDeltaSeq[x][y] = i;                                  
593                                 }                               
594                         }
595                         minDelta[x][y] += singleLeft[x] + singleRight[alignLength - y - 2];
596                         
597                         if(minDelta[x][y] < bestTrimeraMismatches){
598                                 bestTrimeraMismatches = minDelta[x][y];
599                                 
600                                 breakPointA = x;
601                                 breakPointB = y;
602                                 
603                                 leftParent = bestLeft[x];
604                                 middleParent = minDeltaSeq[x][y];
605                                 rightParent = bestRight[alignLength - y - 2];                           
606                         }
607                 }               
608         }
609         return bestTrimeraMismatches;
610 }
611
612 /**************************************************************************************************/
613
614 string RefChimeraTest::stitchBimera(int leftParent, int rightParent, int breakPoint){
615         
616         string chimeraRefSeq = referenceSeqs[leftParent].substr(0, breakPoint) + referenceSeqs[rightParent].substr(breakPoint);
617         return chimeraRefSeq;
618         
619 }
620
621 /**************************************************************************************************/
622
623 string RefChimeraTest::stitchTrimera(int leftParent, int middleParent, int rightParent, int breakPointA, int breakPointB){
624         
625         string chimeraRefSeq = referenceSeqs[leftParent].substr(0, breakPointA) + referenceSeqs[middleParent].substr(breakPointA, breakPointB-breakPointA) + referenceSeqs[rightParent].substr(breakPointB);
626         
627         return chimeraRefSeq;
628 }
629
630 /**************************************************************************************************/
631
632 double RefChimeraTest::calcDistToChimera(string& querySeq, string& chimeraRefSeq){
633         
634         int match = 0;
635         int mismatch = 0;
636         
637         for(int i=0;i<alignLength;i++){
638 //              if(querySeq[i] != '.' && chimeraRefSeq[i] != '.'){
639                 if(chimeraRefSeq[i] != '.' && querySeq[i] != '.'){
640                         if(querySeq[i] == '-' && chimeraRefSeq[i] == '-' && chimeraRefSeq[i] != 'N'){   /*      do nothing      */      }
641                         else if(querySeq[i] == chimeraRefSeq[i]){
642                                 match++;
643                         }
644                         else{
645                                 mismatch++;
646                         }                       
647                 }
648         }
649         
650         return (double)mismatch / (double)(mismatch + match);   
651 }
652
653 //***************************************************************************************************************
654
655 int RefChimeraTest::getClosestRefIndex(){
656
657         return bestMatch;
658         
659 }
660
661 //***************************************************************************************************************
662
663 string RefChimeraTest::getQueryAlignment(){
664     
665         return bestQueryAlignment;
666         
667 }
668
669 //***************************************************************************************************************
670
671 string RefChimeraTest::getClosestRefAlignment(){
672     
673         return bestRefAlignment;
674         
675 }
676
677 //***************************************************************************************************************