]> git.donarmstrong.com Git - mothur.git/commitdiff
true chimera testing using reference sequences
authorpschloss <pschloss>
Thu, 3 Feb 2011 12:29:43 +0000 (12:29 +0000)
committerpschloss <pschloss>
Thu, 3 Feb 2011 12:29:43 +0000 (12:29 +0000)
refchimeratest.cpp [new file with mode: 0644]
refchimeratest.h [new file with mode: 0644]

diff --git a/refchimeratest.cpp b/refchimeratest.cpp
new file mode 100644 (file)
index 0000000..246d439
--- /dev/null
@@ -0,0 +1,276 @@
+/*
+ *  refchimeratest.cpp
+ *  Mothur
+ *
+ *  Created by Pat Schloss on 1/31/11.
+ *  Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "refchimeratest.h"
+#include "mothur.h"
+
+int MAXINT = numeric_limits<int>::max();
+
+//***************************************************************************************************************
+
+RefChimeraTest::RefChimeraTest(vector<Sequence>& refs, string chimeraReportFileName){
+
+       m = MothurOut::getInstance();
+
+       m->openOutputFile(chimeraReportFileName, chimeraReportFile);
+       numRefSeqs = refs.size();
+
+       referenceSeqs.resize(numRefSeqs);
+       referenceNames.resize(numRefSeqs);
+       for(int i=0;i<numRefSeqs;i++){
+               referenceSeqs[i] = refs[i].getAligned();
+               referenceNames[i] = refs[i].getName();
+       }
+       
+       alignLength = referenceSeqs[0].length();
+
+       chimeraReportFile << "queryName\tbestRef\tbestSequenceMismatch\tleftParentChi,rightParentChi\tbreakPointChi\tminMismatchToChimera\t";
+       chimeraReportFile << "leftParentChi,middleParentTri,rightParentChi\tbreakPointTriA,breakPointTriB\tminMismatchToTrimera\tdistToBestMera\tnMera" << endl;
+
+}
+
+//***************************************************************************************************************
+
+int RefChimeraTest::analyzeQuery(string queryName, string querySeq){
+       
+       vector<vector<int> > left(numRefSeqs);
+       vector<int> singleLeft, bestLeft;
+       vector<int> singleRight, bestRight;
+       
+       vector<vector<int> > right(numRefSeqs);
+       for(int i=0;i<numRefSeqs;i++){
+               left[i].assign(alignLength, 0);
+       }
+       right = left;
+       
+       int bestSequenceMismatch = getMismatches(querySeq, left, right, bestMatch);
+       
+       int leftParentBi, rightParentBi, breakPointBi;
+       int minMismatchToChimera = getChimera(left, right, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight);
+       
+       int minMismatchToTrimera = MAXINT;
+       int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
+       
+       int nMera = 0;
+       string chimeraRefSeq = "";
+       
+       if(bestSequenceMismatch - minMismatchToChimera <= 3){
+               nMera = 1;
+               chimeraRefSeq = referenceSeqs[bestMatch];
+       }
+       else {
+               
+               minMismatchToTrimera = getTrimera(left, right, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight);
+
+               if(minMismatchToChimera - minMismatchToTrimera <= 3){
+                       nMera = 2;
+                       chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi);
+               }
+               else{                   
+                       nMera = 3;
+                       chimeraRefSeq = stitchTrimera(leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB);
+               }
+               
+       }
+       double distToChimera = calcDistToChimera(querySeq, chimeraRefSeq);
+       
+//     double loonIndex = calcLoonIndex(querySeq, referenceSeqs[leftParentBi], referenceSeqs[rightParentBi], breakPointBi, binMatrix);         
+       
+       chimeraReportFile << queryName << '\t' << referenceNames[bestMatch] << '\t' << bestSequenceMismatch << '\t';
+       chimeraReportFile << referenceNames[leftParentBi] << ',' << referenceNames[rightParentBi] << '\t' << breakPointBi << '\t';
+       chimeraReportFile << minMismatchToChimera << '\t';
+       
+       if(nMera == 1){
+               chimeraReportFile << "NA" << '\t' << "NA" << '\t' << "NA";
+       }
+       else{
+               chimeraReportFile << referenceNames[leftParentTri] << ',' << referenceNames[middleParentTri] << ',' << referenceNames[rightParentTri] << '\t' << breakPointTriA << ',' << breakPointTriB << '\t' << minMismatchToTrimera;       
+       }
+       
+       chimeraReportFile << '\t' << distToChimera << '\t' << nMera << endl;
+               
+       return nMera;
+}
+
+/**************************************************************************************************/
+
+int RefChimeraTest::getMismatches(string& querySeq, vector<vector<int> >& left, vector<vector<int> >& right, int& bestRefSeq){
+       
+       int bestSequenceMismatch = MAXINT;
+       
+       for(int i=0;i<numRefSeqs;i++){
+               
+               int lDiffs = 0;
+               for(int l=0;l<alignLength;l++){
+                       if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){
+                               lDiffs++;
+                       }
+                       left[i][l] = lDiffs;
+               }
+               
+               int rDiffs = 0;
+               int index = 0;
+               for(int l=alignLength-1;l>=0;l--){
+                       if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){
+                               rDiffs++;
+                       }                       
+                       right[i][index++] = rDiffs;
+               }
+               if(lDiffs < bestSequenceMismatch){
+                       bestSequenceMismatch = lDiffs;
+                       bestRefSeq = i;
+               }
+       }
+       return bestSequenceMismatch;
+}
+
+/**************************************************************************************************/
+
+int RefChimeraTest::getChimera(vector<vector<int> >& left, vector<vector<int> >& right, int& leftParent, int& rightParent, int& breakPoint, vector<int>& singleLeft, vector<int>& bestLeft, vector<int>& singleRight, vector<int>& bestRight){
+       
+       singleLeft.resize(alignLength, MAXINT);
+       bestLeft.resize(alignLength, -1);
+       
+       for(int l=0;l<alignLength;l++){
+               for(int i=0;i<numRefSeqs;i++){
+                       if(left[i][l] <= singleLeft[l]){
+                               singleLeft[l] = left[i][l];
+                               bestLeft[l] = i;
+                       }
+               }
+       }
+       
+       singleRight.resize(alignLength, MAXINT);
+       bestRight.resize(alignLength, -1);
+       
+       for(int l=0;l<alignLength;l++){
+               for(int i=0;i<numRefSeqs;i++){
+                       if(right[i][l] <= singleRight[l]){
+                               singleRight[l] = right[i][l];
+                               bestRight[l] = i;
+                       }
+               }
+       }
+       
+       int bestChimeraMismatches = MAXINT;
+       leftParent = -1;
+       rightParent = -1;
+       breakPoint = -1;
+       
+       for(int l=0;l<alignLength;l++){
+               int chimera = singleLeft[l] + singleRight[alignLength - l - 1];
+               if(chimera < bestChimeraMismatches){
+                       bestChimeraMismatches = chimera;
+                       breakPoint = l;
+                       leftParent = bestLeft[l];
+                       rightParent = bestRight[alignLength - l - 1];
+               }
+       }
+       
+       return bestChimeraMismatches;
+}
+
+/**************************************************************************************************/
+
+int RefChimeraTest::getTrimera(vector<vector<int> >& left, vector<vector<int> >& right, int& leftParent, int& middleParent, int& rightParent, int& breakPointA, int& breakPointB, vector<int>& singleLeft, vector<int>& bestLeft, vector<int>& singleRight, vector<int>& bestRight){
+       
+       int bestTrimeraMismatches = MAXINT;
+       
+       leftParent = -1;
+       middleParent = -1;
+       rightParent = -1;
+       
+       breakPointA = -1;
+       breakPointB = -1;
+       
+       vector<vector<int> > minDelta(alignLength);
+       vector<vector<int> > minDeltaSeq(alignLength);
+       
+       for(int i=0;i<alignLength;i++){
+               minDelta[i].assign(alignLength, MAXINT);
+               minDeltaSeq[i].assign(alignLength, -1);
+       }
+       
+       for(int x=0;x<alignLength;x++){
+               for(int y=x;y<alignLength-1;y++){
+                       
+                       for(int i=0;i<numRefSeqs;i++){
+                               int delta = left[i][y] - left[i][x];
+                               
+                               if(delta <= minDelta[x][y]){
+                                       minDelta[x][y] = delta;
+                                       minDeltaSeq[x][y] = i;                                  
+                               }                               
+                       }
+                       minDelta[x][y] += singleLeft[x] + singleRight[alignLength - y - 2];
+                       
+                       if(minDelta[x][y] < bestTrimeraMismatches){
+                               bestTrimeraMismatches = minDelta[x][y];
+                               
+                               breakPointA = x;
+                               breakPointB = y;
+                               
+                               leftParent = bestLeft[x];
+                               middleParent = minDeltaSeq[x][y];
+                               rightParent = bestRight[alignLength - y - 2];                           
+                       }
+               }               
+       }
+       return bestTrimeraMismatches;
+}
+
+/**************************************************************************************************/
+
+string RefChimeraTest::stitchBimera(int leftParent, int rightParent, int breakPoint){
+       
+       string chimeraRefSeq = referenceSeqs[leftParent].substr(0, breakPoint) + referenceSeqs[rightParent].substr(breakPoint);
+       return chimeraRefSeq;
+       
+}
+
+/**************************************************************************************************/
+
+string RefChimeraTest::stitchTrimera(int leftParent, int middleParent, int rightParent, int breakPointA, int breakPointB){
+       
+       string chimeraRefSeq = referenceSeqs[leftParent].substr(0, breakPointA) + referenceSeqs[middleParent].substr(breakPointA, breakPointB-breakPointA) + referenceSeqs[rightParent].substr(breakPointB);
+       
+       return chimeraRefSeq;
+}
+
+/**************************************************************************************************/
+
+double RefChimeraTest::calcDistToChimera(string& querySeq, string& chimeraRefSeq){
+       
+       int match = 0;
+       int mismatch = 0;
+       
+       for(int i=0;i<alignLength;i++){
+               if(querySeq[i] != '.' && chimeraRefSeq[i] != '.'){
+                       if(querySeq[i] == '-' && chimeraRefSeq[i] == '-'){      /*      do nothing      */      }
+                       else if(querySeq[i] == chimeraRefSeq[i]){
+                               match++;
+                       }
+                       else{
+                               mismatch++;
+                       }                       
+               }
+       }
+       
+       return (double)mismatch / (double)(mismatch + match);   
+}
+
+//***************************************************************************************************************
+
+int RefChimeraTest::getClosestRefIndex(){
+
+       return bestMatch;
+       
+}
+
+//***************************************************************************************************************
diff --git a/refchimeratest.h b/refchimeratest.h
new file mode 100644 (file)
index 0000000..6b648fb
--- /dev/null
@@ -0,0 +1,40 @@
+#ifndef REFCHIMERATEST
+#define REFCHIMERATEST
+
+/*
+ *  refchimeratest.h
+ *  Mothur
+ *
+ *  Created by Pat Schloss on 1/31/11.
+ *  Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "sequence.hpp"
+
+
+class RefChimeraTest {
+       
+public:
+       RefChimeraTest(vector<Sequence>&, string);
+       int analyzeQuery(string, string);
+       int getClosestRefIndex();
+private:
+       int getMismatches(string&, vector<vector<int> >&, vector<vector<int> >&, int&);
+       int getChimera(vector<vector<int> >&, vector<vector<int> >&, int&, int&, int&, vector<int>&, vector<int>&, vector<int>&, vector<int>&);
+       int getTrimera(vector<vector<int> >&, vector<vector<int> >&, int&, int&, int&, int&, int&, vector<int>&, vector<int>&, vector<int>&, vector<int>&);
+       string stitchBimera(int, int, int);
+       string stitchTrimera(int, int, int, int, int);
+       double calcDistToChimera(string&, string&);
+
+       vector<string> referenceSeqs;
+       vector<string> referenceNames;
+       int numRefSeqs;
+       int alignLength;
+       int bestMatch;
+       ofstream chimeraReportFile;
+       
+       MothurOut* m;
+};
+
+#endif