]> git.donarmstrong.com Git - mothur.git/blob - refchimeratest.cpp
mods to seq.errror
[mothur.git] / refchimeratest.cpp
1 /*
2  *  refchimeratest.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 1/31/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "refchimeratest.h"
11 #include "mothur.h"
12
13 int MAXINT = numeric_limits<int>::max();
14
15 //***************************************************************************************************************
16
17 RefChimeraTest::RefChimeraTest(vector<Sequence>& refs, string chimeraReportFileName){
18
19         m = MothurOut::getInstance();
20
21         m->openOutputFile(chimeraReportFileName, chimeraReportFile);
22         numRefSeqs = refs.size();
23
24         referenceSeqs.resize(numRefSeqs);
25         referenceNames.resize(numRefSeqs);
26         for(int i=0;i<numRefSeqs;i++){
27                 referenceSeqs[i] = refs[i].getAligned();
28                 referenceNames[i] = refs[i].getName();
29         }
30         
31         alignLength = referenceSeqs[0].length();
32
33         chimeraReportFile << "queryName\tbestRef\tbestSequenceMismatch\tleftParentChi,rightParentChi\tbreakPointChi\tminMismatchToChimera\tdistToBestMera\tnumParents" << endl;
34 //      chimeraReportFile << "leftParentTri,middleParentTri,rightParentTri\tbreakPointTriA,breakPointTriB\tminMismatchToTrimera\tdistToBestMera\tnMera" << endl;
35
36 }
37
38 //***************************************************************************************************************
39
40 int RefChimeraTest::analyzeQuery(string queryName, string querySeq){
41         
42         vector<vector<int> > left(numRefSeqs);
43         vector<int> singleLeft, bestLeft;
44         vector<int> singleRight, bestRight;
45         
46         vector<vector<int> > right(numRefSeqs);
47         for(int i=0;i<numRefSeqs;i++){
48                 left[i].assign(alignLength, 0);
49         }
50         right = left;
51         
52         int bestSequenceMismatch = getMismatches(querySeq, left, right, bestMatch);
53         
54         int leftParentBi, rightParentBi, breakPointBi;
55         int minMismatchToChimera = getChimera(left, right, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight);
56         
57 //      int minMismatchToTrimera = MAXINT;
58 //      int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
59         
60         int nMera = 0;
61         string chimeraRefSeq = "";
62         
63         if(bestSequenceMismatch - minMismatchToChimera <= 3){
64                 nMera = 1;
65                 chimeraRefSeq = referenceSeqs[bestMatch];
66         }
67         else {
68
69                 nMera = 2;
70                 chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi);
71                 
72 //              minMismatchToTrimera = getTrimera(left, right, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight);
73 //
74 //              if(minMismatchToChimera - minMismatchToTrimera <= 3){
75 //                      nMera = 2;
76 //                      chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi);
77 //              }
78 //              else{                   
79 //                      nMera = 3;
80 //                      chimeraRefSeq = stitchTrimera(leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB);
81 //              }
82                 
83         }
84         double distToChimera = calcDistToChimera(querySeq, chimeraRefSeq);
85         
86 //      double loonIndex = calcLoonIndex(querySeq, referenceSeqs[leftParentBi], referenceSeqs[rightParentBi], breakPointBi, binMatrix);         
87         
88         chimeraReportFile << queryName << '\t' << referenceNames[bestMatch] << '\t' << bestSequenceMismatch << '\t';
89         chimeraReportFile << referenceNames[leftParentBi] << ',' << referenceNames[rightParentBi] << '\t' << breakPointBi << '\t';
90         chimeraReportFile << minMismatchToChimera << '\t';
91         
92 //      if(nMera == 1){
93 //              chimeraReportFile << "NA" << '\t' << "NA" << '\t' << "NA";
94 //      }
95 //      else{
96 //              chimeraReportFile << referenceNames[leftParentTri] << ',' << referenceNames[middleParentTri] << ',' << referenceNames[rightParentTri] << '\t' << breakPointTriA << ',' << breakPointTriB << '\t' << minMismatchToTrimera;       
97 //      }
98         
99         chimeraReportFile << '\t' << distToChimera << '\t' << nMera << endl;
100                 
101         return nMera;
102 }
103
104 /**************************************************************************************************/
105
106 int RefChimeraTest::getMismatches(string& querySeq, vector<vector<int> >& left, vector<vector<int> >& right, int& bestRefSeq){
107         
108         int bestSequenceMismatch = MAXINT;
109         
110         for(int i=0;i<numRefSeqs;i++){
111                 
112                 int lDiffs = 0;
113                 for(int l=0;l<alignLength;l++){
114 //                      if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){
115                         if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l]){
116                                 lDiffs++;
117                         }
118                         left[i][l] = lDiffs;
119                 }
120                 
121                 int rDiffs = 0;
122                 int index = 0;
123                 for(int l=alignLength-1;l>=0;l--){
124 //                      if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){
125                         if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l]){
126                                 rDiffs++;
127                         }                       
128                         right[i][index++] = rDiffs;
129                 }
130                 if(lDiffs < bestSequenceMismatch){
131                         bestSequenceMismatch = lDiffs;
132                         bestRefSeq = i;
133                 }
134         }
135         return bestSequenceMismatch;
136 }
137
138 /**************************************************************************************************/
139
140 int RefChimeraTest::getChimera(vector<vector<int> >& left, vector<vector<int> >& right, int& leftParent, int& rightParent, int& breakPoint, vector<int>& singleLeft, vector<int>& bestLeft, vector<int>& singleRight, vector<int>& bestRight){
141         
142         singleLeft.resize(alignLength, MAXINT);
143         bestLeft.resize(alignLength, -1);
144         
145         for(int l=0;l<alignLength;l++){
146                 for(int i=0;i<numRefSeqs;i++){
147                         if(left[i][l] <= singleLeft[l]){
148                                 singleLeft[l] = left[i][l];
149                                 bestLeft[l] = i;
150                         }
151                 }
152         }
153         
154         singleRight.resize(alignLength, MAXINT);
155         bestRight.resize(alignLength, -1);
156         
157         for(int l=0;l<alignLength;l++){
158                 for(int i=0;i<numRefSeqs;i++){
159                         if(right[i][l] <= singleRight[l]){
160                                 singleRight[l] = right[i][l];
161                                 bestRight[l] = i;
162                         }
163                 }
164         }
165         
166         int bestChimeraMismatches = MAXINT;
167         leftParent = -1;
168         rightParent = -1;
169         breakPoint = -1;
170         
171         for(int l=0;l<alignLength;l++){
172                 int chimera = singleLeft[l] + singleRight[alignLength - l - 1];
173                 if(chimera < bestChimeraMismatches){
174                         bestChimeraMismatches = chimera;
175                         breakPoint = l;
176                         leftParent = bestLeft[l];
177                         rightParent = bestRight[alignLength - l - 1];
178                 }
179         }
180         
181         return bestChimeraMismatches;
182 }
183
184 /**************************************************************************************************/
185
186 int RefChimeraTest::getTrimera(vector<vector<int> >& left, vector<vector<int> >& right, int& leftParent, int& middleParent, int& rightParent, int& breakPointA, int& breakPointB, vector<int>& singleLeft, vector<int>& bestLeft, vector<int>& singleRight, vector<int>& bestRight){
187         
188         int bestTrimeraMismatches = MAXINT;
189         
190         leftParent = -1;
191         middleParent = -1;
192         rightParent = -1;
193         
194         breakPointA = -1;
195         breakPointB = -1;
196         
197         vector<vector<int> > minDelta(alignLength);
198         vector<vector<int> > minDeltaSeq(alignLength);
199         
200         for(int i=0;i<alignLength;i++){
201                 minDelta[i].assign(alignLength, MAXINT);
202                 minDeltaSeq[i].assign(alignLength, -1);
203         }
204         
205         for(int x=0;x<alignLength;x++){
206                 for(int y=x;y<alignLength-1;y++){
207                         
208                         for(int i=0;i<numRefSeqs;i++){
209                                 int delta = left[i][y] - left[i][x];
210                                 
211                                 if(delta <= minDelta[x][y]){
212                                         minDelta[x][y] = delta;
213                                         minDeltaSeq[x][y] = i;                                  
214                                 }                               
215                         }
216                         minDelta[x][y] += singleLeft[x] + singleRight[alignLength - y - 2];
217                         
218                         if(minDelta[x][y] < bestTrimeraMismatches){
219                                 bestTrimeraMismatches = minDelta[x][y];
220                                 
221                                 breakPointA = x;
222                                 breakPointB = y;
223                                 
224                                 leftParent = bestLeft[x];
225                                 middleParent = minDeltaSeq[x][y];
226                                 rightParent = bestRight[alignLength - y - 2];                           
227                         }
228                 }               
229         }
230         return bestTrimeraMismatches;
231 }
232
233 /**************************************************************************************************/
234
235 string RefChimeraTest::stitchBimera(int leftParent, int rightParent, int breakPoint){
236         
237         string chimeraRefSeq = referenceSeqs[leftParent].substr(0, breakPoint) + referenceSeqs[rightParent].substr(breakPoint);
238         return chimeraRefSeq;
239         
240 }
241
242 /**************************************************************************************************/
243
244 string RefChimeraTest::stitchTrimera(int leftParent, int middleParent, int rightParent, int breakPointA, int breakPointB){
245         
246         string chimeraRefSeq = referenceSeqs[leftParent].substr(0, breakPointA) + referenceSeqs[middleParent].substr(breakPointA, breakPointB-breakPointA) + referenceSeqs[rightParent].substr(breakPointB);
247         
248         return chimeraRefSeq;
249 }
250
251 /**************************************************************************************************/
252
253 double RefChimeraTest::calcDistToChimera(string& querySeq, string& chimeraRefSeq){
254         
255         int match = 0;
256         int mismatch = 0;
257         
258         for(int i=0;i<alignLength;i++){
259 //              if(querySeq[i] != '.' && chimeraRefSeq[i] != '.'){
260                 if(chimeraRefSeq[i] != '.'){
261                         if(querySeq[i] == '-' && chimeraRefSeq[i] == '-'){      /*      do nothing      */      }
262                         else if(querySeq[i] == chimeraRefSeq[i]){
263                                 match++;
264                         }
265                         else{
266                                 mismatch++;
267                         }                       
268                 }
269         }
270         
271         return (double)mismatch / (double)(mismatch + match);   
272 }
273
274 //***************************************************************************************************************
275
276 int RefChimeraTest::getClosestRefIndex(){
277
278         return bestMatch;
279         
280 }
281
282 //***************************************************************************************************************