]> git.donarmstrong.com Git - mothur.git/blob - maligner.cpp
started adding chimeraslayer method and fixed minor bug in treegroupscommand deconstr...
[mothur.git] / maligner.cpp
1 /*
2  *  maligner.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/23/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "maligner.h"
11
12 /***********************************************************************/
13 Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int minCov) :
14                 db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minCoverage(minCov) {}
15 /***********************************************************************/
16 string Maligner::getResults(Sequence* q) {
17         try {
18                 
19                 //make copy so trimming doesn't destroy query from calling class - remember to deallocate
20                 query = new Sequence(q->getName(), q->getAligned());
21                 
22                 string chimera;
23         
24                 decalc = new DeCalculator();
25                 
26                 //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
27                 refSeqs = decalc->findClosest(query, db, numWanted);
28                 
29                 ofstream out;
30                 string outFile = "parentsOf" + query->getName();
31                 openOutputFile(outFile, out);
32                 for (int i = 0; i < refSeqs.size(); i++) {   refSeqs[i]->printSequence(out);    }
33                 out.close();
34                 
35                 refSeqs = minCoverageFilter(refSeqs);
36                 
37                 if (refSeqs.size() < 2)  { 
38                         for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];  }
39                         percentIdenticalQueryChimera = 0.0;
40                         return "unknown"; 
41                 }
42                 
43                 int chimeraPenalty = computeChimeraPenalty();
44         
45                 //trims seqs to first non gap char in all seqs and last non gap char in all seqs
46                 decalc->trimSeqs(query, refSeqs);
47                 
48                 vector<Sequence*> temp = refSeqs;
49                 temp.push_back(query);
50                 
51                 verticalFilter(temp);
52
53                 vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
54                 
55                 fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
56                 
57                 vector<score_struct> path = extractHighestPath(matrix);
58                 
59                 vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
60                 
61                 if (trace.size() > 1) {         chimera = "yes";        }
62                 else { chimera = "no";  }
63                 
64                 int traceStart = path[0].col;
65                 int traceEnd = path[path.size()-1].col;
66                 
67                 string queryInRange = query->getAligned();
68                 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
69         
70                 string chimeraSeq = constructChimericSeq(trace, refSeqs);
71                 
72                 percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
73                 
74                 delete decalc;
75                 
76                 //save output results
77                 for (int i = 0; i < trace.size(); i++) {
78                         int regionStart = trace[i].col;
79                         int regionEnd = trace[i].oldCol;
80                         int seqIndex = trace[i].row;
81                         
82                         results temp;
83                         
84                         temp.parent = refSeqs[seqIndex]->getName();
85                         temp.regionStart = regionStart;
86                         temp.regionEnd = regionEnd;
87                         
88                         string parentInRange = refSeqs[seqIndex]->getAligned();
89                         parentInRange = parentInRange.substr(traceStart, (traceEnd-traceStart+1));
90                         
91                         temp.queryToParent = computePercentID(queryInRange, parentInRange);
92                         temp.divR = (percentIdenticalQueryChimera / temp.queryToParent);
93                         
94                         string queryInRegion = query->getAligned();
95                         queryInRegion = queryInRegion.substr(regionStart, (regionEnd-regionStart+1));
96                         
97                         string parentInRegion = refSeqs[seqIndex]->getAligned();
98                         parentInRegion = parentInRegion.substr(regionStart, (regionEnd-regionStart+1));
99                         
100                         temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
101                 
102                         outputResults.push_back(temp);
103                 }
104                 
105                 //free memory
106                 delete query;
107                 for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];  }
108                 
109                 return chimera;
110         }
111         catch(exception& e) {
112                 errorOut(e, "Maligner", "getResults");
113                 exit(1);
114         }
115 }
116 /***********************************************************************/
117 //removes top matches that do not have minimum coverage with query.
118 vector<Sequence*> Maligner::minCoverageFilter(vector<Sequence*> ref){  
119         try {
120                 vector<Sequence*> newRefs;
121                 
122                 string queryAligned = query->getAligned();
123                 
124                 for (int i = 0; i < ref.size(); i++) {
125                         
126                         string refAligned = ref[i]->getAligned();
127                         
128                         int numBases = 0;
129                         int numCovered = 0;
130                         
131                         //calculate coverage
132                         for (int j = 0; j < queryAligned.length(); j++) {
133                                 
134                                 if (isalpha(queryAligned[j])) {
135                                         numBases++;
136                                         
137                                         if (isalpha(refAligned[j])) {
138                                                 numCovered++;
139                                         }
140                                 }
141                         }
142                         
143                         int coverage = ((numCovered/(float)numBases)*100);
144                         
145                         //if coverage above minimum
146                         if (coverage > minCoverage) {
147                                 newRefs.push_back(ref[i]);
148                         }
149                 }
150                 
151                 return newRefs;
152         }
153         catch(exception& e) {
154                 errorOut(e, "Maligner", "minCoverageFilter");
155                 exit(1);
156         }
157 }
158 /***********************************************************************/
159 // a breakpoint should yield fewer mismatches than this number with respect to the best parent sequence.
160 int Maligner::computeChimeraPenalty() {
161         try {
162                 
163                 int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases());
164         
165                 int penalty = int(numAllowable + 1) * misMatchPenalty;
166                                                                                          
167                 return penalty;
168
169         }
170         catch(exception& e) {
171                 errorOut(e, "Maligner", "computeChimeraPenalty");
172                 exit(1);
173         }
174 }
175 /***********************************************************************/
176 //this is a vertical filter
177 void Maligner::verticalFilter(vector<Sequence*> seqs) {
178         try {
179                 vector<int> gaps;       gaps.resize(query->getAligned().length(), 0);
180                 
181                 string filterString = (string(query->getAligned().length(), '1'));
182                 
183                 //for each sequence
184                 for (int i = 0; i < seqs.size(); i++) {
185                 
186                         string seqAligned = seqs[i]->getAligned();
187                         
188                         for (int j = 0; j < seqAligned.length(); j++) {
189                                 //if this spot is a gap
190                                 if ((seqAligned[j] == '-') || (seqAligned[j] == '.'))   {       gaps[j]++;      }
191                         }
192                 }
193                 
194                 //zero out spot where all sequences have blanks
195                 int numColRemoved = 0;
196                 for(int i = 0; i < seqs[0]->getAligned().length(); i++){
197                         if(gaps[i] == seqs.size())      {       filterString[i] = '0';  numColRemoved++;  }
198                 }
199                 
200                 //for each sequence
201                 for (int i = 0; i < seqs.size(); i++) {
202                 
203                         string seqAligned = seqs[i]->getAligned();
204                         string newAligned = "";
205                         
206                         for (int j = 0; j < seqAligned.length(); j++) {
207                                 //if this spot is not a gap
208                                 if (filterString[j] == '1') { newAligned += seqAligned[j]; }
209                         }
210                         
211                         seqs[i]->setAligned(newAligned);
212                 }
213
214         
215         }
216         catch(exception& e) {
217                 errorOut(e, "Maligner", "verticalFilter");
218                 exit(1);
219         }
220 }
221 //***************************************************************************************************************
222 vector< vector<score_struct> > Maligner::buildScoreMatrix(int cols, int rows) {
223         try{
224                 
225                 vector< vector<score_struct> > m; m.resize(rows);
226                 
227                 for (int i = 0; i < m.size(); i++) {
228                         for (int j = 0; j < cols; j++) {
229                                 
230                                 //initialize each cell
231                                 score_struct temp;
232                                 temp.prev = -1;
233                                 temp.score = -9999999;
234                                 temp.col = j;
235                                 temp.row = i;
236                                 
237                                 m[i].push_back(temp);
238                         }
239                 }
240                 
241                 return m;
242         }
243         catch(exception& e) {
244                 errorOut(e, "Maligner", "buildScoreMatrix");
245                 exit(1);
246         }
247 }
248 //***************************************************************************************************************
249 void Maligner::fillScoreMatrix(vector<vector<score_struct> >& m, vector<Sequence*> seqs, int penalty) {
250         try{
251                 
252                 //get matrix dimensions
253                 int numCols = query->getAligned().length();
254                 int numRows = seqs.size();
255                 
256                 //initialize first col
257                 string queryAligned = query->getAligned();
258                 for (int i = 0; i < numRows; i++) {
259                         string subjectAligned = seqs[i]->getAligned();
260                         
261                         //are you both gaps?
262                         if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
263                                 m[i][0].score = 0;
264                         }else if (queryAligned[0] == subjectAligned[0]) {
265                                 m[i][0].score = matchScore;
266                         }else{
267                                 m[i][0].score = 0;
268                         }
269                 }
270                 
271                 //fill rest of matrix
272                 for (int j = 1; j < numCols; j++) {  //iterate through matrix columns
273                 
274                         for (int i = 0; i < numRows; i++) {  //iterate through matrix rows
275                                 
276                                 string subjectAligned = seqs[i]->getAligned();
277                                 
278                                 int matchMisMatchScore = 0;
279                                 //are you both gaps?
280                                 if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
281                                         //leave the same
282                                 }else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
283                                         //leave the same
284                                 }else if (queryAligned[j] == subjectAligned[j]) {
285                                         matchMisMatchScore = matchScore;
286                                 }else if (queryAligned[j] != subjectAligned[j]) {
287                                         matchMisMatchScore = misMatchPenalty;
288                                 }
289                                 
290                                 //compute score based on previous columns scores
291                                 for (int prevIndex = 0; prevIndex < numRows; prevIndex++) { //iterate through rows
292                                         
293                                         int sumScore = matchMisMatchScore + m[prevIndex][j-1].score;
294                                         
295                                         //you are not at yourself
296                                         if (prevIndex != i) {   sumScore += penalty;    }
297                                         if (sumScore < 0)       {       sumScore = 0;                   }
298                                         
299                                         if (sumScore > m[i][j].score) {
300                                                 m[i][j].score = sumScore;
301                                                 m[i][j].prev = prevIndex;
302                                         }
303                                 }
304                         }
305                 }
306                 
307         }
308         catch(exception& e) {
309                 errorOut(e, "Maligner", "fillScoreMatrix");
310                 exit(1);
311         }
312 }
313 //***************************************************************************************************************
314 vector<score_struct> Maligner::extractHighestPath(vector<vector<score_struct> > m) {
315         try {
316         
317                 //get matrix dimensions
318                 int numCols = query->getAligned().length();
319                 int numRows = m.size();
320         
321         
322                 //find highest score scoring matrix
323                 score_struct highestStruct;
324                 int highestScore = 0;
325                 
326                 for (int i = 0; i < numRows; i++) {
327                         for (int j = 0; j < numCols; j++) {
328                                 if (m[i][j].score > highestScore) {
329                                         highestScore = m[i][j].score;
330                                         highestStruct = m[i][j];
331                                 }
332                         }
333                 }
334                                 
335                 vector<score_struct> path;
336                 
337                 int rowIndex = highestStruct.row;
338                 int pos = highestStruct.col;
339                 int score = highestStruct.score;
340                 
341                 while (pos >= 0 && score > 0) {
342                         score_struct temp = m[rowIndex][pos];
343                         score = temp.score;
344                         
345                         if (score > 0) {        path.push_back(temp);   }
346                         
347                         rowIndex = temp.prev;
348                         pos--;
349                 }
350
351                 reverse(path.begin(), path.end());
352         
353                 return path;
354                 
355         }
356         catch(exception& e) {
357                 errorOut(e, "Maligner", "extractHighestPath");
358                 exit(1);
359         }
360 }
361 //***************************************************************************************************************
362 vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path, vector<Sequence*> seqs) {
363         try {
364                 vector<trace_struct> trace;
365                 
366                 int region_index = path[0].row;
367                 int region_start = path[0].col;
368         
369                 for (int i = 1; i < path.size(); i++) {
370                 
371                         int next_region_index = path[i].row;
372                         
373                         if (next_region_index != region_index) {
374                                 
375                                 // add trace region
376                                 int col_index = path[i].col;
377                                 trace_struct temp;
378                                 temp.col = region_start;
379                                 temp.oldCol = col_index-1;
380                                 temp.row = region_index;
381                                 
382                                 trace.push_back(temp);
383                                                         
384                                 region_index = path[i].row;
385                                 region_start = col_index;
386                         }
387                 }
388         
389                 // get last one
390                 trace_struct temp;
391                 temp.col = region_start;
392                 temp.oldCol = path[path.size()-1].col;
393                 temp.row = region_index;
394                 trace.push_back(temp);
395
396                 return trace;
397                 
398         }
399         catch(exception& e) {
400                 errorOut(e, "Maligner", "mapTraceRegionsToAlignment");
401                 exit(1);
402         }
403 }
404 //***************************************************************************************************************
405 string Maligner::constructChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
406         try {
407                 string chimera = "";
408                 
409                 for (int i = 0; i < trace.size(); i++) {
410                         string seqAlign = seqs[trace[i].row]->getAligned();
411                         seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
412                         chimera += seqAlign;
413                 }
414                         
415                 return chimera;
416         }
417         catch(exception& e) {
418                 errorOut(e, "Maligner", "constructChimericSeq");
419                 exit(1);
420         }
421 }
422 //***************************************************************************************************************
423 float Maligner::computePercentID(string queryAlign, string chimera) {
424         try {
425         
426                 if (queryAlign.length() != chimera.length()) {
427                         mothurOut("Error, alignment strings are of different lengths: "); mothurOutEndLine();
428                         mothurOut(toString(queryAlign.length())); mothurOutEndLine(); mothurOutEndLine(); mothurOutEndLine(); mothurOutEndLine();
429                         mothurOut(toString(chimera.length())); mothurOutEndLine();
430                         return -1.0;
431                 }
432
433         
434                 int numBases = 0;
435                 int numIdentical = 0;
436         
437                 for (int i = 0; i < queryAlign.length(); i++) {
438                         if ((isalpha(queryAlign[i])) || (isalpha(chimera[i])))  {
439                                 numBases++;             
440                                 if (queryAlign[i] == chimera[i]) {
441                                         numIdentical++;
442                                 }
443                         }
444                 }
445         
446                 if (numBases == 0) { return 0; }
447         
448                 float percentIdentical = (numIdentical/(float)numBases) * 100;
449
450                 return percentIdentical;
451                 
452         }
453         catch(exception& e) {
454                 errorOut(e, "Maligner", "computePercentID");
455                 exit(1);
456         }
457 }
458 //***************************************************************************************************************
459