]> git.donarmstrong.com Git - mothur.git/blob - refchimeratest.cpp
true chimera testing using reference sequences
[mothur.git] / refchimeratest.cpp
1 /*
2  *  refchimeratest.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 1/31/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "refchimeratest.h"
11 #include "mothur.h"
12
13 int MAXINT = numeric_limits<int>::max();
14
15 //***************************************************************************************************************
16
17 RefChimeraTest::RefChimeraTest(vector<Sequence>& refs, string chimeraReportFileName){
18
19         m = MothurOut::getInstance();
20
21         m->openOutputFile(chimeraReportFileName, chimeraReportFile);
22         numRefSeqs = refs.size();
23
24         referenceSeqs.resize(numRefSeqs);
25         referenceNames.resize(numRefSeqs);
26         for(int i=0;i<numRefSeqs;i++){
27                 referenceSeqs[i] = refs[i].getAligned();
28                 referenceNames[i] = refs[i].getName();
29         }
30         
31         alignLength = referenceSeqs[0].length();
32
33         chimeraReportFile << "queryName\tbestRef\tbestSequenceMismatch\tleftParentChi,rightParentChi\tbreakPointChi\tminMismatchToChimera\t";
34         chimeraReportFile << "leftParentChi,middleParentTri,rightParentChi\tbreakPointTriA,breakPointTriB\tminMismatchToTrimera\tdistToBestMera\tnMera" << endl;
35
36 }
37
38 //***************************************************************************************************************
39
40 int RefChimeraTest::analyzeQuery(string queryName, string querySeq){
41         
42         vector<vector<int> > left(numRefSeqs);
43         vector<int> singleLeft, bestLeft;
44         vector<int> singleRight, bestRight;
45         
46         vector<vector<int> > right(numRefSeqs);
47         for(int i=0;i<numRefSeqs;i++){
48                 left[i].assign(alignLength, 0);
49         }
50         right = left;
51         
52         int bestSequenceMismatch = getMismatches(querySeq, left, right, bestMatch);
53         
54         int leftParentBi, rightParentBi, breakPointBi;
55         int minMismatchToChimera = getChimera(left, right, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight);
56         
57         int minMismatchToTrimera = MAXINT;
58         int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
59         
60         int nMera = 0;
61         string chimeraRefSeq = "";
62         
63         if(bestSequenceMismatch - minMismatchToChimera <= 3){
64                 nMera = 1;
65                 chimeraRefSeq = referenceSeqs[bestMatch];
66         }
67         else {
68                 
69                 minMismatchToTrimera = getTrimera(left, right, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight);
70
71                 if(minMismatchToChimera - minMismatchToTrimera <= 3){
72                         nMera = 2;
73                         chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi);
74                 }
75                 else{                   
76                         nMera = 3;
77                         chimeraRefSeq = stitchTrimera(leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB);
78                 }
79                 
80         }
81         double distToChimera = calcDistToChimera(querySeq, chimeraRefSeq);
82         
83 //      double loonIndex = calcLoonIndex(querySeq, referenceSeqs[leftParentBi], referenceSeqs[rightParentBi], breakPointBi, binMatrix);         
84         
85         chimeraReportFile << queryName << '\t' << referenceNames[bestMatch] << '\t' << bestSequenceMismatch << '\t';
86         chimeraReportFile << referenceNames[leftParentBi] << ',' << referenceNames[rightParentBi] << '\t' << breakPointBi << '\t';
87         chimeraReportFile << minMismatchToChimera << '\t';
88         
89         if(nMera == 1){
90                 chimeraReportFile << "NA" << '\t' << "NA" << '\t' << "NA";
91         }
92         else{
93                 chimeraReportFile << referenceNames[leftParentTri] << ',' << referenceNames[middleParentTri] << ',' << referenceNames[rightParentTri] << '\t' << breakPointTriA << ',' << breakPointTriB << '\t' << minMismatchToTrimera;       
94         }
95         
96         chimeraReportFile << '\t' << distToChimera << '\t' << nMera << endl;
97                 
98         return nMera;
99 }
100
101 /**************************************************************************************************/
102
103 int RefChimeraTest::getMismatches(string& querySeq, vector<vector<int> >& left, vector<vector<int> >& right, int& bestRefSeq){
104         
105         int bestSequenceMismatch = MAXINT;
106         
107         for(int i=0;i<numRefSeqs;i++){
108                 
109                 int lDiffs = 0;
110                 for(int l=0;l<alignLength;l++){
111                         if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){
112                                 lDiffs++;
113                         }
114                         left[i][l] = lDiffs;
115                 }
116                 
117                 int rDiffs = 0;
118                 int index = 0;
119                 for(int l=alignLength-1;l>=0;l--){
120                         if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){
121                                 rDiffs++;
122                         }                       
123                         right[i][index++] = rDiffs;
124                 }
125                 if(lDiffs < bestSequenceMismatch){
126                         bestSequenceMismatch = lDiffs;
127                         bestRefSeq = i;
128                 }
129         }
130         return bestSequenceMismatch;
131 }
132
133 /**************************************************************************************************/
134
135 int RefChimeraTest::getChimera(vector<vector<int> >& left, vector<vector<int> >& right, int& leftParent, int& rightParent, int& breakPoint, vector<int>& singleLeft, vector<int>& bestLeft, vector<int>& singleRight, vector<int>& bestRight){
136         
137         singleLeft.resize(alignLength, MAXINT);
138         bestLeft.resize(alignLength, -1);
139         
140         for(int l=0;l<alignLength;l++){
141                 for(int i=0;i<numRefSeqs;i++){
142                         if(left[i][l] <= singleLeft[l]){
143                                 singleLeft[l] = left[i][l];
144                                 bestLeft[l] = i;
145                         }
146                 }
147         }
148         
149         singleRight.resize(alignLength, MAXINT);
150         bestRight.resize(alignLength, -1);
151         
152         for(int l=0;l<alignLength;l++){
153                 for(int i=0;i<numRefSeqs;i++){
154                         if(right[i][l] <= singleRight[l]){
155                                 singleRight[l] = right[i][l];
156                                 bestRight[l] = i;
157                         }
158                 }
159         }
160         
161         int bestChimeraMismatches = MAXINT;
162         leftParent = -1;
163         rightParent = -1;
164         breakPoint = -1;
165         
166         for(int l=0;l<alignLength;l++){
167                 int chimera = singleLeft[l] + singleRight[alignLength - l - 1];
168                 if(chimera < bestChimeraMismatches){
169                         bestChimeraMismatches = chimera;
170                         breakPoint = l;
171                         leftParent = bestLeft[l];
172                         rightParent = bestRight[alignLength - l - 1];
173                 }
174         }
175         
176         return bestChimeraMismatches;
177 }
178
179 /**************************************************************************************************/
180
181 int RefChimeraTest::getTrimera(vector<vector<int> >& left, vector<vector<int> >& right, int& leftParent, int& middleParent, int& rightParent, int& breakPointA, int& breakPointB, vector<int>& singleLeft, vector<int>& bestLeft, vector<int>& singleRight, vector<int>& bestRight){
182         
183         int bestTrimeraMismatches = MAXINT;
184         
185         leftParent = -1;
186         middleParent = -1;
187         rightParent = -1;
188         
189         breakPointA = -1;
190         breakPointB = -1;
191         
192         vector<vector<int> > minDelta(alignLength);
193         vector<vector<int> > minDeltaSeq(alignLength);
194         
195         for(int i=0;i<alignLength;i++){
196                 minDelta[i].assign(alignLength, MAXINT);
197                 minDeltaSeq[i].assign(alignLength, -1);
198         }
199         
200         for(int x=0;x<alignLength;x++){
201                 for(int y=x;y<alignLength-1;y++){
202                         
203                         for(int i=0;i<numRefSeqs;i++){
204                                 int delta = left[i][y] - left[i][x];
205                                 
206                                 if(delta <= minDelta[x][y]){
207                                         minDelta[x][y] = delta;
208                                         minDeltaSeq[x][y] = i;                                  
209                                 }                               
210                         }
211                         minDelta[x][y] += singleLeft[x] + singleRight[alignLength - y - 2];
212                         
213                         if(minDelta[x][y] < bestTrimeraMismatches){
214                                 bestTrimeraMismatches = minDelta[x][y];
215                                 
216                                 breakPointA = x;
217                                 breakPointB = y;
218                                 
219                                 leftParent = bestLeft[x];
220                                 middleParent = minDeltaSeq[x][y];
221                                 rightParent = bestRight[alignLength - y - 2];                           
222                         }
223                 }               
224         }
225         return bestTrimeraMismatches;
226 }
227
228 /**************************************************************************************************/
229
230 string RefChimeraTest::stitchBimera(int leftParent, int rightParent, int breakPoint){
231         
232         string chimeraRefSeq = referenceSeqs[leftParent].substr(0, breakPoint) + referenceSeqs[rightParent].substr(breakPoint);
233         return chimeraRefSeq;
234         
235 }
236
237 /**************************************************************************************************/
238
239 string RefChimeraTest::stitchTrimera(int leftParent, int middleParent, int rightParent, int breakPointA, int breakPointB){
240         
241         string chimeraRefSeq = referenceSeqs[leftParent].substr(0, breakPointA) + referenceSeqs[middleParent].substr(breakPointA, breakPointB-breakPointA) + referenceSeqs[rightParent].substr(breakPointB);
242         
243         return chimeraRefSeq;
244 }
245
246 /**************************************************************************************************/
247
248 double RefChimeraTest::calcDistToChimera(string& querySeq, string& chimeraRefSeq){
249         
250         int match = 0;
251         int mismatch = 0;
252         
253         for(int i=0;i<alignLength;i++){
254                 if(querySeq[i] != '.' && chimeraRefSeq[i] != '.'){
255                         if(querySeq[i] == '-' && chimeraRefSeq[i] == '-'){      /*      do nothing      */      }
256                         else if(querySeq[i] == chimeraRefSeq[i]){
257                                 match++;
258                         }
259                         else{
260                                 mismatch++;
261                         }                       
262                 }
263         }
264         
265         return (double)mismatch / (double)(mismatch + match);   
266 }
267
268 //***************************************************************************************************************
269
270 int RefChimeraTest::getClosestRefIndex(){
271
272         return bestMatch;
273         
274 }
275
276 //***************************************************************************************************************