]> git.donarmstrong.com Git - mothur.git/blobdiff - refchimeratest.cpp
changing command name classify.shared to classifyrf.shared
[mothur.git] / refchimeratest.cpp
index aab17e6b50d1c966f53c7d56e05c383507237a27..c084bffd40c94ef9c14a43d05de39e768dc96bea 100644 (file)
@@ -8,36 +8,62 @@
  */
 
 #include "refchimeratest.h"
+#include "myPerseus.h"
 #include "mothur.h"
 
 int MAXINT = numeric_limits<int>::max();
 
 //***************************************************************************************************************
 
-RefChimeraTest::RefChimeraTest(vector<Sequence>& refs, string chimeraReportFileName){
+RefChimeraTest::RefChimeraTest(vector<Sequence>& refs, bool aligned) : aligned(aligned){
 
        m = MothurOut::getInstance();
 
-       m->openOutputFile(chimeraReportFileName, chimeraReportFile);
        numRefSeqs = refs.size();
 
        referenceSeqs.resize(numRefSeqs);
        referenceNames.resize(numRefSeqs);
        for(int i=0;i<numRefSeqs;i++){
-               referenceSeqs[i] = refs[i].getAligned();
+               if (aligned) { referenceSeqs[i] = refs[i].getAligned(); }
+        else { referenceSeqs[i] = refs[i].getUnaligned(); }
                referenceNames[i] = refs[i].getName();
        }
        
        alignLength = referenceSeqs[0].length();
 
-       chimeraReportFile << "queryName\tbestRef\tbestSequenceMismatch\tleftParentChi,rightParentChi\tbreakPointChi\tminMismatchToChimera\tdistToBestMera\tnumParents" << endl;
-//     chimeraReportFile << "leftParentTri,middleParentTri,rightParentTri\tbreakPointTriA,breakPointTriB\tminMismatchToTrimera\tdistToBestMera\tnMera" << endl;
+}
+//***************************************************************************************************************
 
+int RefChimeraTest::printHeader(ofstream& chimeraReportFile){
+       try {
+               chimeraReportFile << "queryName\tbestRef\tbestSequenceMismatch\tleftParentChi,rightParentChi\tbreakPointChi\tminMismatchToChimera\tdistToBestMera\tnumParents" << endl;
+               return 0; 
+       }catch(exception& e) {
+               m->errorOut(e, "RefChimeraTest", "execute");
+               exit(1);
+       }
 }
 
 //***************************************************************************************************************
 
-int RefChimeraTest::analyzeQuery(string queryName, string querySeq){
+int RefChimeraTest::analyzeQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
+
+    int numParents = -1;
+    
+    if(aligned){
+        numParents = analyzeAlignedQuery(queryName, querySeq, chimeraReportFile);
+    }
+    else{
+        numParents = analyzeUnalignedQuery(queryName, querySeq, chimeraReportFile);
+    }
+    
+    return numParents;
+    
+}
+//***************************************************************************************************************
+
+int RefChimeraTest::analyzeAlignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
        
        vector<vector<int> > left(numRefSeqs);
        vector<int> singleLeft, bestLeft;
@@ -49,72 +75,424 @@ int RefChimeraTest::analyzeQuery(string queryName, string querySeq){
        }
        right = left;
        
-       int bestSequenceMismatch = getMismatches(querySeq, left, right, bestMatch);
+       int bestSequenceMismatch = getAlignedMismatches(querySeq, left, right, bestMatch);
+    
+    
        
        int leftParentBi, rightParentBi, breakPointBi;
        int minMismatchToChimera = getChimera(left, right, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight);
        
-//     int minMismatchToTrimera = MAXINT;
-//     int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
-       
        int nMera = 0;
        string chimeraRefSeq = "";
        
-       if(bestSequenceMismatch - minMismatchToChimera > 3 || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){
-       
+       if(bestSequenceMismatch - minMismatchToChimera >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){
                nMera = 2;
                chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi);
-               
-//             minMismatchToTrimera = getTrimera(left, right, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight);
-//
-//             if(minMismatchToChimera - minMismatchToTrimera <= 3){
-//                     nMera = 2;
-//                     chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi);
-//             }
-//             else{                   
-//                     nMera = 3;
-//                     chimeraRefSeq = stitchTrimera(leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB);
-//             }
-               
        }
        else{
                nMera = 1;
                chimeraRefSeq = referenceSeqs[bestMatch];
        }
        
-       
-       double distToChimera = calcDistToChimera(querySeq, chimeraRefSeq);
-       
-//     double loonIndex = calcLoonIndex(querySeq, referenceSeqs[leftParentBi], referenceSeqs[rightParentBi], breakPointBi, binMatrix);         
+       bestRefAlignment = chimeraRefSeq;
+    bestQueryAlignment = querySeq;
+    
+       double distToChimera = calcDistToChimera(bestQueryAlignment, bestRefAlignment);
        
        chimeraReportFile << queryName << '\t' << referenceNames[bestMatch] << '\t' << bestSequenceMismatch << '\t';
        chimeraReportFile << referenceNames[leftParentBi] << ',' << referenceNames[rightParentBi] << '\t' << breakPointBi << '\t';
        chimeraReportFile << minMismatchToChimera << '\t';
-       
-//     if(nMera == 1){
-//             chimeraReportFile << "NA" << '\t' << "NA" << '\t' << "NA";
-//     }
-//     else{
-//             chimeraReportFile << referenceNames[leftParentTri] << ',' << referenceNames[middleParentTri] << ',' << referenceNames[rightParentTri] << '\t' << breakPointTriA << ',' << breakPointTriB << '\t' << minMismatchToTrimera;       
-//     }
-       
-       chimeraReportFile << '\t' << distToChimera << '\t' << nMera << endl;
+    chimeraReportFile << '\t' << distToChimera << '\t' << nMera << endl;
                
        return nMera;
 }
 
+//***************************************************************************************************************
+
+int RefChimeraTest::analyzeUnalignedQuery(string queryName, string querySeq, ofstream& chimeraReportFile){
+       
+    int nMera = 0;
+    
+    int seqLength = querySeq.length();
+    
+    vector<string> queryAlign(numRefSeqs);
+    vector<string> refAlign(numRefSeqs);
+
+    vector<vector<int> > leftDiffs(numRefSeqs);
+    vector<vector<int> > rightDiffs(numRefSeqs);
+    vector<vector<int> > leftMaps(numRefSeqs);
+    vector<vector<int> > rightMaps(numRefSeqs);
+    
+    int bestRefIndex = -1;
+    int bestRefDiffs = numeric_limits<int>::max();
+    double bestRefLength = 0;
+    
+    for(int i=0;i<numRefSeqs;i++){
+        double length = 0;
+        double diffs = alignQueryToReferences(querySeq, referenceSeqs[i], queryAlign[i], refAlign[i], length);
+        if(diffs < bestRefDiffs){
+            bestRefDiffs = diffs;
+            bestRefLength = length;
+            bestRefIndex = i;
+        }
+    }
+
+    if(bestRefDiffs >= 3){
+        for(int i=0;i<numRefSeqs;i++){
+            leftDiffs[i].assign(seqLength, 0);
+            rightDiffs[i].assign(seqLength, 0);
+            leftMaps[i].assign(seqLength, 0);
+            rightMaps[i].assign(seqLength, 0);
+            
+            getUnalignedDiffs(queryAlign[i], refAlign[i], leftDiffs[i], leftMaps[i], rightDiffs[i], rightMaps[i]);
+        }
+    
+        vector<int> singleLeft(seqLength, numeric_limits<int>::max());
+        vector<int> bestLeft(seqLength, -1);
+        
+        for(int l=0;l<seqLength;l++){
+            
+            for(int i=0;i<numRefSeqs;i++){            
+                if(leftDiffs[i][l] < singleLeft[l]){
+                    singleLeft[l] = leftDiffs[i][l];
+                    bestLeft[l] = i;
+                }
+            }
+        }
+        
+        vector<int> singleRight(seqLength, numeric_limits<int>::max());
+        vector<int> bestRight(seqLength, -1);
+        
+        for(int l=0;l<seqLength;l++){
+                
+            for(int i=0;i<numRefSeqs;i++){
+                if(rightDiffs[i][l] < singleRight[l]){
+                    singleRight[l] = rightDiffs[i][l];
+                    bestRight[l] = i;
+                }
+            }
+        }
+        
+        int bestChimeraMismatches = numeric_limits<int>::max();
+        int leftParent = 0;
+        int rightParent = 0;
+        int breakPoint = 0;
+        
+        for(int l=0;l<seqLength-1;l++){
+            
+            int chimera = singleLeft[l] + singleRight[seqLength - l - 2];
+            if(chimera < bestChimeraMismatches){
+                bestChimeraMismatches = chimera;
+                breakPoint = l;
+                leftParent = bestLeft[l];
+                rightParent = bestRight[seqLength - l - 2];
+            }
+        }
+        
+        string reference;
+        
+        if(bestRefDiffs - bestChimeraMismatches >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){
+            nMera = 2;
+            
+            int breakLeft = leftMaps[leftParent][breakPoint];
+            int breakRight = rightMaps[rightParent][rightMaps[rightParent].size() - breakPoint - 2];
+            
+            string left = refAlign[leftParent];
+            string right = refAlign[rightParent];
+            
+            for(int i=0;i<=breakLeft;i++){
+                
+                if (m->control_pressed) { return 0; }
+                
+                if(left[i] != '-' && left[i] != '.'){
+                    reference += left[i];
+                }
+            }
+            
+            
+            for(int i=breakRight;i<right.length();i++){
+                
+                if (m->control_pressed) { return 0; }
+                
+                if(right[i] != '-' && right[i] != '.'){
+                    reference += right[i];
+                }
+            }
+
+        }
+        else{
+            nMera = 1;
+            reference = referenceSeqs[bestRefIndex];
+        }
+
+        double alignLength;
+        double finalDiffs = alignQueryToReferences(querySeq, reference, bestQueryAlignment, bestRefAlignment, alignLength);
+        double finalDistance = finalDiffs / alignLength;
+
+        chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t';
+        chimeraReportFile << referenceNames[leftParent] << ',' << referenceNames[rightParent] << '\t' << breakPoint << '\t';
+        chimeraReportFile << bestChimeraMismatches << '\t';
+        chimeraReportFile << '\t' << finalDistance << '\t' << nMera << endl;
+    }
+    else{
+        bestQueryAlignment = queryAlign[bestRefIndex];
+        bestRefAlignment = refAlign[bestRefIndex];
+        nMera = 1;
+        
+        chimeraReportFile << queryName << '\t' << referenceNames[bestRefIndex] << '\t' << bestRefDiffs << '\t';
+        chimeraReportFile << "NA\tNA\tNA\tNA\t1" << endl;
+    } 
+    
+    bestMatch = bestRefIndex;
+    return nMera;
+}
+
 /**************************************************************************************************/
 
-int RefChimeraTest::getMismatches(string& querySeq, vector<vector<int> >& left, vector<vector<int> >& right, int& bestRefSeq){
+double RefChimeraTest::alignQueryToReferences(string query, string reference, string& qAlign, string& rAlign, double& length){
+    
+    
+    try {
+               double GAP = -5;
+               double MATCH = 1;
+               double MISMATCH = -1;
+               
+               int queryLength = query.length();
+               int refLength = reference.length();
+               
+               vector<vector<double> > alignMatrix(queryLength + 1);
+               vector<vector<char> > alignMoves(queryLength + 1);
+               
+               for(int i=0;i<=queryLength;i++){
+                       if (m->control_pressed) { return 0; }
+                       alignMatrix[i].resize(refLength + 1, 0);
+                       alignMoves[i].resize(refLength + 1, 'x');
+               }
+               
+               for(int i=0;i<=queryLength;i++){
+                       if (m->control_pressed) { return 0; }
+                       alignMatrix[i][0] = 0;//GAP * i;
+                       alignMoves[i][0] = 'u';
+               }
+               
+               for(int i=0;i<=refLength;i++){
+                       if (m->control_pressed) { return 0; }
+                       alignMatrix[0][i] = 0;//GAP * i;
+                       alignMoves[0][i] = 'l';
+               }
+                       
+               for(int i=1;i<=queryLength;i++){
+                       
+                       if (m->control_pressed) { return 0; }
+                       
+                       for(int j=1;j<=refLength;j++){
+                               
+                               double nogapScore;              
+                               if(query[i-1] == reference[j-1]){       nogapScore = alignMatrix[i-1][j-1] + MATCH;             }
+                               else                                                    {       nogapScore = alignMatrix[i-1][j-1] + MISMATCH;  }
+                               
+                               double leftScore;
+                               if(i == queryLength)                    {       leftScore = alignMatrix[i][j-1];                                }
+                               else                                                    {       leftScore = alignMatrix[i][j-1] + GAP;                  }
+                               
+                               
+                               double upScore;
+                               if(j == refLength)                              {       upScore = alignMatrix[i-1][j];                                  }
+                               else                                                    {       upScore = alignMatrix[i-1][j] + GAP;                    }
+                               
+                               if(nogapScore > leftScore){
+                                       if(nogapScore > upScore){
+                                               alignMoves[i][j] = 'd';
+                                               alignMatrix[i][j] = nogapScore;
+                                       }
+                                       else{
+                                               alignMoves[i][j] = 'u';
+                                               alignMatrix[i][j] = upScore;
+                                       }
+                               }
+                               else{
+                                       if(leftScore > upScore){
+                                               alignMoves[i][j] = 'l';
+                                               alignMatrix[i][j] = leftScore;
+                                       }
+                                       else{
+                                               alignMoves[i][j] = 'u';
+                                               alignMatrix[i][j] = upScore;
+                                       }
+                               }
+                       }
+               }
+                       
+               int end = refLength - 1;
+        int maxRow = 0;
+        double maxRowValue = -2147483647;
+        for(int i=0;i<queryLength;i++){
+            if(alignMatrix[i][end] > maxRowValue){
+                maxRow = i;
+                maxRowValue = alignMatrix[i][end];
+            }
+        }
+        
+        end = queryLength - 1;
+        int maxColumn = 0;
+        double maxColumnValue = -2147483647;
+
+        for(int j=0;j<refLength;j++){
+            if(alignMatrix[end][j] > maxColumnValue){
+                maxColumn = j;
+                maxColumnValue = alignMatrix[end][j];
+            }
+        }
+
+        int row = queryLength-1;
+        int column = refLength-1;
+        
+        if(maxColumn == column && maxRow == row){}     //      if the max values are the lower right corner, then we're good
+        else if(alignMatrix[row][maxColumn] < alignMatrix[maxRow][column]){
+            for(int i=maxRow+1;i<queryLength;i++){                     //      decide whether sequence A or B needs the gaps at the end either set 
+                alignMoves[i][column] = 'u';// the pointer upwards or...
+            }
+            
+        }
+        else {
+            for(int i=maxColumn+1;i<refLength;i++){
+                alignMoves[row][i] = 'l';      //      ...to the left
+            }
+        }
+
+        int i = queryLength;
+               int j = refLength;
+
+        
+               qAlign = "";
+               rAlign = "";
+        
+               int diffs = 0;
+               length = 0;
+
+               while(i > 0 && j > 0){
+                       
+                       if (m->control_pressed) { return 0; }
+                       
+                       if(alignMoves[i][j] == 'd'){
+                               qAlign = query[i-1] + qAlign;
+                               rAlign = reference[j-1] + rAlign;
+                
+                               if(query[i-1] != reference[j-1]){       diffs++;        }
+                               length++;
+                               
+                               i--;
+                               j--;
+                       }
+                       else if(alignMoves[i][j] == 'u'){
+                               qAlign = query[i-1] + qAlign;
+                               
+                               if(j != refLength)      {       rAlign = '-' + rAlign;  diffs++;        length++;       }
+                               else                            {       rAlign = '.' + rAlign;  }
+                               i--;
+                       }
+                       else if(alignMoves[i][j] == 'l'){
+                               rAlign = reference[j-1] + rAlign;
+                               
+                               if(i != queryLength){   qAlign = '-' + qAlign;  diffs++;        length++;       }
+                               else                            {       qAlign = '.' + qAlign;  }
+                               j--;
+                       }
+               }
+
+        if(i>0){
+            qAlign = query.substr(0, i) + qAlign;
+            rAlign = string(i, '.') + rAlign;
+        }
+               else if(j>0){
+            qAlign = string(j, '.') + qAlign;
+            rAlign = reference.substr(0, j) + rAlign;
+        }
+
+        
+               return diffs;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RefChimeraTest", "alignQueryToReferences");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int RefChimeraTest::getUnalignedDiffs(string qAlign, string rAlign, vector<int>& leftDiffs, vector<int>& leftMap, vector<int>& rightDiffs, vector<int>& rightMap){
+       try {
+               int alignLength = qAlign.length();
+               
+               int lDiffs = 0;
+               int lCount = 0;
+               for(int l=0;l<alignLength;l++){
+                       
+                       if (m->control_pressed) { return 0; }
+                       
+                       if(qAlign[l] == '-'){
+                               lDiffs++;               
+                       }
+                       else if(qAlign[l] != '.'){
+                               
+                               if(rAlign[l] == '-'){
+                                       lDiffs++;
+                               }
+                               else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){
+                                       lDiffs++;
+                               }
+                               leftDiffs[lCount] = lDiffs;
+                               leftMap[lCount] = l;
+                               
+                               lCount++;
+                       }                       
+               }
+
+               int rDiffs = 0;
+               int rCount = 0;
+               for(int l=alignLength-1;l>=0;l--){
+                       
+                       if (m->control_pressed) { return 0; }
+                       
+                       if(qAlign[l] == '-'){
+                               rDiffs++;               
+                       }
+                       else if(qAlign[l] != '.'){
+                               
+                               if(rAlign[l] == '-'){
+                                       rDiffs++;
+                               }
+                               else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){
+                                       rDiffs++;
+                               }
+                               
+                               rightDiffs[rCount] = rDiffs;
+                               rightMap[rCount] = l;
+                               rCount++;
+                       }
+                       
+               }
+
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RefChimeraTest", "getUnalignedDiffs");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int RefChimeraTest::getAlignedMismatches(string& querySeq, vector<vector<int> >& left, vector<vector<int> >& right, int& bestRefSeq){
        
        int bestSequenceMismatch = MAXINT;
        
        for(int i=0;i<numRefSeqs;i++){
                
                int lDiffs = 0;
+               
                for(int l=0;l<alignLength;l++){
-//                     if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){
-                       if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l]){
+                       if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
                                lDiffs++;
                        }
                        left[i][l] = lDiffs;
@@ -123,12 +501,12 @@ int RefChimeraTest::getMismatches(string& querySeq, vector<vector<int> >& left,
                int rDiffs = 0;
                int index = 0;
                for(int l=alignLength-1;l>=0;l--){
-//                     if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){
-                       if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l]){
+                       if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
                                rDiffs++;
                        }                       
                        right[i][index++] = rDiffs;
                }
+               
                if(lDiffs < bestSequenceMismatch){
                        bestSequenceMismatch = lDiffs;
                        bestRefSeq = i;
@@ -259,8 +637,8 @@ double RefChimeraTest::calcDistToChimera(string& querySeq, string& chimeraRefSeq
        
        for(int i=0;i<alignLength;i++){
 //             if(querySeq[i] != '.' && chimeraRefSeq[i] != '.'){
-               if(chimeraRefSeq[i] != '.'){
-                       if(querySeq[i] == '-' && chimeraRefSeq[i] == '-'){      /*      do nothing      */      }
+               if(chimeraRefSeq[i] != '.' && querySeq[i] != '.'){
+                       if(querySeq[i] == '-' && chimeraRefSeq[i] == '-' && chimeraRefSeq[i] != 'N'){   /*      do nothing      */      }
                        else if(querySeq[i] == chimeraRefSeq[i]){
                                match++;
                        }
@@ -282,3 +660,19 @@ int RefChimeraTest::getClosestRefIndex(){
 }
 
 //***************************************************************************************************************
+
+string RefChimeraTest::getQueryAlignment(){
+    
+       return bestQueryAlignment;
+       
+}
+
+//***************************************************************************************************************
+
+string RefChimeraTest::getClosestRefAlignment(){
+    
+       return bestRefAlignment;
+       
+}
+
+//***************************************************************************************************************