]> git.donarmstrong.com Git - mothur.git/blob - maligner.cpp
b393194bbc3a4c89fd2ae8ea3f7f7ba67f84b927
[mothur.git] / maligner.cpp
1 /*
2  *  maligner.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/23/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "maligner.h"
11 #include "database.hpp"
12 #include "blastdb.hpp"
13
14 /***********************************************************************/
15 Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode) :
16                 db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode) {}
17 /***********************************************************************/
18 string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
19         try {
20                 
21                 outputResults.clear();
22                 
23                 //make copy so trimming doesn't destroy query from calling class - remember to deallocate
24                 query = new Sequence(q->getName(), q->getAligned());
25                 
26                 string chimera;
27                 
28                 if (searchMethod != "blast") {
29                         //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
30                         refSeqs = decalc->findClosest(query, db, numWanted);
31                 }else{
32                         refSeqs = getBlastSeqs(query, numWanted);
33                 }
34                 
35                 refSeqs = minCoverageFilter(refSeqs);
36                 
37                 if (refSeqs.size() < 2)  { 
38                         for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];  }
39                         percentIdenticalQueryChimera = 0.0;
40                         return "unknown"; 
41                 }
42                 
43                 int chimeraPenalty = computeChimeraPenalty();
44         
45                 //fills outputResults
46                 chimera = chimeraMaligner(chimeraPenalty, decalc);
47                 
48                                 
49                 //free memory
50                 delete query;
51                 for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];  }
52                 
53                 return chimera;
54         }
55         catch(exception& e) {
56                 errorOut(e, "Maligner", "getResults");
57                 exit(1);
58         }
59 }
60 /***********************************************************************/
61 string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
62         try {
63                 
64                 string chimera;
65                 
66                 //trims seqs to first non gap char in all seqs and last non gap char in all seqs
67                 spotMap = decalc->trimSeqs(query, refSeqs);
68                 
69                 vector<Sequence*> temp = refSeqs;
70                 temp.push_back(query);
71                 
72                 verticalFilter(temp);
73
74                 vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
75                 
76                 fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
77                 
78                 vector<score_struct> path = extractHighestPath(matrix);
79                 
80                 vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
81                 
82                 if (trace.size() > 1) {         chimera = "yes";        }
83                 else { chimera = "no";  }
84                 
85                 int traceStart = path[0].col;
86                 int traceEnd = path[path.size()-1].col;
87                 
88                 string queryInRange = query->getAligned();
89                 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
90         
91                 string chimeraSeq = constructChimericSeq(trace, refSeqs);
92                 
93                 percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
94                 
95                 //save output results
96                 for (int i = 0; i < trace.size(); i++) {
97                         int regionStart = trace[i].col;
98                         int regionEnd = trace[i].oldCol;
99                         int seqIndex = trace[i].row;
100                         
101                         results temp;
102                         
103                         temp.parent = refSeqs[seqIndex]->getName();
104                         temp.nastRegionStart = spotMap[regionStart];
105                         temp.nastRegionEnd = spotMap[regionEnd];
106                         temp.regionStart = regionStart;
107                         temp.regionEnd = regionEnd;
108                         
109                         string parentInRange = refSeqs[seqIndex]->getAligned();
110                         parentInRange = parentInRange.substr(traceStart, (traceEnd-traceStart+1));
111                         
112                         temp.queryToParent = computePercentID(queryInRange, parentInRange);
113                         temp.divR = (percentIdenticalQueryChimera / temp.queryToParent);
114                         
115                         string queryInRegion = query->getAligned();
116                         queryInRegion = queryInRegion.substr(regionStart, (regionEnd-regionStart+1));
117                         
118                         string parentInRegion = refSeqs[seqIndex]->getAligned();
119                         parentInRegion = parentInRegion.substr(regionStart, (regionEnd-regionStart+1));
120                         
121                         temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
122                 
123                         outputResults.push_back(temp);
124                 }
125
126                 return chimera;
127         }
128         catch(exception& e) {
129                 errorOut(e, "Maligner", "chimeraMaligner");
130                 exit(1);
131         }
132 }
133 /***********************************************************************/
134 //removes top matches that do not have minimum coverage with query.
135 vector<Sequence*> Maligner::minCoverageFilter(vector<Sequence*> ref){  
136         try {
137                 vector<Sequence*> newRefs;
138                 
139                 string queryAligned = query->getAligned();
140                 
141                 for (int i = 0; i < ref.size(); i++) {
142                         
143                         string refAligned = ref[i]->getAligned();
144                         
145                         int numBases = 0;
146                         int numCovered = 0;
147                         
148                         //calculate coverage
149                         for (int j = 0; j < queryAligned.length(); j++) {
150                                 
151                                 if (isalpha(queryAligned[j])) {
152                                         numBases++;
153                                         
154                                         if (isalpha(refAligned[j])) {
155                                                 numCovered++;
156                                         }
157                                 }
158                         }
159                         
160                         int coverage = ((numCovered/(float)numBases)*100);
161                         
162                         //if coverage above minimum
163                         if (coverage > minCoverage) {
164                                 newRefs.push_back(ref[i]);
165                         }
166                 }
167                 
168                 return newRefs;
169         }
170         catch(exception& e) {
171                 errorOut(e, "Maligner", "minCoverageFilter");
172                 exit(1);
173         }
174 }
175 /***********************************************************************/
176 // a breakpoint should yield fewer mismatches than this number with respect to the best parent sequence.
177 int Maligner::computeChimeraPenalty() {
178         try {
179                 
180                 int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases());
181         
182                 int penalty = int(numAllowable + 1) * misMatchPenalty;
183                                                                                          
184                 return penalty;
185
186         }
187         catch(exception& e) {
188                 errorOut(e, "Maligner", "computeChimeraPenalty");
189                 exit(1);
190         }
191 }
192 /***********************************************************************/
193 //this is a vertical filter
194 void Maligner::verticalFilter(vector<Sequence*> seqs) {
195         try {
196                 vector<int> gaps;       gaps.resize(query->getAligned().length(), 0);
197                 
198                 string filterString = (string(query->getAligned().length(), '1'));
199                 
200                 //for each sequence
201                 for (int i = 0; i < seqs.size(); i++) {
202                 
203                         string seqAligned = seqs[i]->getAligned();
204                         
205                         for (int j = 0; j < seqAligned.length(); j++) {
206                                 //if this spot is a gap
207                                 if ((seqAligned[j] == '-') || (seqAligned[j] == '.'))   {       gaps[j]++;      }
208                         }
209                 }
210                 
211                 //zero out spot where all sequences have blanks
212                 int numColRemoved = 0;
213                 for(int i = 0; i < seqs[0]->getAligned().length(); i++){
214                         if(gaps[i] == seqs.size())      {       filterString[i] = '0';  numColRemoved++;  }
215                 }
216                 
217                 map<int, int> newMap;
218                 //for each sequence
219                 for (int i = 0; i < seqs.size(); i++) {
220                 
221                         string seqAligned = seqs[i]->getAligned();
222                         string newAligned = "";
223                         int count = 0;
224                         
225                         for (int j = 0; j < seqAligned.length(); j++) {
226                                 //if this spot is not a gap
227                                 if (filterString[j] == '1') { 
228                                         newAligned += seqAligned[j]; 
229                                         newMap[count] = spotMap[j];
230                                         count++;
231                                 }
232                         }
233                         
234                         seqs[i]->setAligned(newAligned);
235                 }
236
237                 spotMap = newMap;
238         }
239         catch(exception& e) {
240                 errorOut(e, "Maligner", "verticalFilter");
241                 exit(1);
242         }
243 }
244 //***************************************************************************************************************
245 vector< vector<score_struct> > Maligner::buildScoreMatrix(int cols, int rows) {
246         try{
247                 
248                 vector< vector<score_struct> > m; m.resize(rows);
249                 
250                 for (int i = 0; i < m.size(); i++) {
251                         for (int j = 0; j < cols; j++) {
252                                 
253                                 //initialize each cell
254                                 score_struct temp;
255                                 temp.prev = -1;
256                                 temp.score = -9999999;
257                                 temp.col = j;
258                                 temp.row = i;
259                                 
260                                 m[i].push_back(temp);
261                         }
262                 }
263                 
264                 return m;
265         }
266         catch(exception& e) {
267                 errorOut(e, "Maligner", "buildScoreMatrix");
268                 exit(1);
269         }
270 }
271 //***************************************************************************************************************
272 void Maligner::fillScoreMatrix(vector<vector<score_struct> >& m, vector<Sequence*> seqs, int penalty) {
273         try{
274                 
275                 //get matrix dimensions
276                 int numCols = query->getAligned().length();
277                 int numRows = seqs.size();
278                 
279                 //initialize first col
280                 string queryAligned = query->getAligned();
281                 for (int i = 0; i < numRows; i++) {
282                         string subjectAligned = seqs[i]->getAligned();
283                         
284                         //are you both gaps?
285                         if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
286                                 m[i][0].score = 0;
287                         }else if (queryAligned[0] == subjectAligned[0]) {
288                                 m[i][0].score = matchScore;
289                         }else{
290                                 m[i][0].score = 0;
291                         }
292                 }
293                 
294                 //fill rest of matrix
295                 for (int j = 1; j < numCols; j++) {  //iterate through matrix columns
296                 
297                         for (int i = 0; i < numRows; i++) {  //iterate through matrix rows
298                                 
299                                 string subjectAligned = seqs[i]->getAligned();
300                                 
301                                 int matchMisMatchScore = 0;
302                                 //are you both gaps?
303                                 if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
304                                         //leave the same
305                                 }else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
306                                         //leave the same
307                                 }else if (queryAligned[j] == subjectAligned[j]) {
308                                         matchMisMatchScore = matchScore;
309                                 }else if (queryAligned[j] != subjectAligned[j]) {
310                                         matchMisMatchScore = misMatchPenalty;
311                                 }
312                                 
313                                 //compute score based on previous columns scores
314                                 for (int prevIndex = 0; prevIndex < numRows; prevIndex++) { //iterate through rows
315                                         
316                                         int sumScore = matchMisMatchScore + m[prevIndex][j-1].score;
317                                         
318                                         //you are not at yourself
319                                         if (prevIndex != i) {   sumScore += penalty;    }
320                                         if (sumScore < 0)       {       sumScore = 0;                   }
321                                         
322                                         if (sumScore > m[i][j].score) {
323                                                 m[i][j].score = sumScore;
324                                                 m[i][j].prev = prevIndex;
325                                         }
326                                 }
327                         }
328                 }
329                 
330         }
331         catch(exception& e) {
332                 errorOut(e, "Maligner", "fillScoreMatrix");
333                 exit(1);
334         }
335 }
336 //***************************************************************************************************************
337 vector<score_struct> Maligner::extractHighestPath(vector<vector<score_struct> > m) {
338         try {
339         
340                 //get matrix dimensions
341                 int numCols = query->getAligned().length();
342                 int numRows = m.size();
343         
344         
345                 //find highest score scoring matrix
346                 score_struct highestStruct;
347                 int highestScore = 0;
348                 
349                 for (int i = 0; i < numRows; i++) {
350                         for (int j = 0; j < numCols; j++) {
351                                 if (m[i][j].score > highestScore) {
352                                         highestScore = m[i][j].score;
353                                         highestStruct = m[i][j];
354                                 }
355                         }
356                 }
357                                 
358                 vector<score_struct> path;
359                 
360                 int rowIndex = highestStruct.row;
361                 int pos = highestStruct.col;
362                 int score = highestStruct.score;
363                 
364                 while (pos >= 0 && score > 0) {
365                         score_struct temp = m[rowIndex][pos];
366                         score = temp.score;
367                         
368                         if (score > 0) {        path.push_back(temp);   }
369                         
370                         rowIndex = temp.prev;
371                         pos--;
372                 }
373
374                 reverse(path.begin(), path.end());
375         
376                 return path;
377                 
378         }
379         catch(exception& e) {
380                 errorOut(e, "Maligner", "extractHighestPath");
381                 exit(1);
382         }
383 }
384 //***************************************************************************************************************
385 vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path, vector<Sequence*> seqs) {
386         try {
387                 vector<trace_struct> trace;
388                 
389                 int region_index = path[0].row;
390                 int region_start = path[0].col;
391         
392                 for (int i = 1; i < path.size(); i++) {
393                 
394                         int next_region_index = path[i].row;
395                         
396                         if (next_region_index != region_index) {
397                                 
398                                 // add trace region
399                                 int col_index = path[i].col;
400                                 trace_struct temp;
401                                 temp.col = region_start;
402                                 temp.oldCol = col_index-1;
403                                 temp.row = region_index;
404                                 
405                                 trace.push_back(temp);
406                                                         
407                                 region_index = path[i].row;
408                                 region_start = col_index;
409                         }
410                 }
411         
412                 // get last one
413                 trace_struct temp;
414                 temp.col = region_start;
415                 temp.oldCol = path[path.size()-1].col;
416                 temp.row = region_index;
417                 trace.push_back(temp);
418
419                 return trace;
420                 
421         }
422         catch(exception& e) {
423                 errorOut(e, "Maligner", "mapTraceRegionsToAlignment");
424                 exit(1);
425         }
426 }
427 //***************************************************************************************************************
428 string Maligner::constructChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
429         try {
430                 string chimera = "";
431                 
432                 for (int i = 0; i < trace.size(); i++) {
433                         string seqAlign = seqs[trace[i].row]->getAligned();
434                         seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
435                         chimera += seqAlign;
436                 }
437                         
438                 return chimera;
439         }
440         catch(exception& e) {
441                 errorOut(e, "Maligner", "constructChimericSeq");
442                 exit(1);
443         }
444 }
445 //***************************************************************************************************************
446 float Maligner::computePercentID(string queryAlign, string chimera) {
447         try {
448         
449                 if (queryAlign.length() != chimera.length()) {
450                         mothurOut("Error, alignment strings are of different lengths: "); mothurOutEndLine();
451                         mothurOut(toString(queryAlign.length())); mothurOutEndLine(); mothurOutEndLine(); mothurOutEndLine(); mothurOutEndLine();
452                         mothurOut(toString(chimera.length())); mothurOutEndLine();
453                         return -1.0;
454                 }
455
456         
457                 int numBases = 0;
458                 int numIdentical = 0;
459         
460                 for (int i = 0; i < queryAlign.length(); i++) {
461                         if ((isalpha(queryAlign[i])) || (isalpha(chimera[i])))  {
462                                 numBases++;             
463                                 if (queryAlign[i] == chimera[i]) {
464                                         numIdentical++;
465                                 }
466                         }
467                 }
468         
469                 if (numBases == 0) { return 0; }
470         
471                 float percentIdentical = (numIdentical/(float)numBases) * 100;
472
473                 return percentIdentical;
474                 
475         }
476         catch(exception& e) {
477                 errorOut(e, "Maligner", "computePercentID");
478                 exit(1);
479         }
480 }
481 //***************************************************************************************************************
482 vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
483         try {   
484                 //generate blastdb
485                 Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty);
486                 for (int i = 0; i < db.size(); i++) {   database->addSequence(*db[i]);  }
487                 database->generateDB();
488                 database->setNumSeqs(db.size());
489                 
490                 //get parts of query
491                 string queryUnAligned = q->getUnaligned();
492                 string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
493                 string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
494
495                 Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
496                 Sequence* queryRight = new Sequence(q->getName(), rightQuery);
497                 
498                 vector<int> tempIndexesRight = database->findClosestMegaBlast(queryRight, num+1);
499                 vector<int> tempIndexesLeft = database->findClosestMegaBlast(queryLeft, num+1);
500                 
501                 //merge results         
502                 map<int, int> seen;
503                 map<int, int>::iterator it;
504                 
505                 vector<int> mergedResults;
506                 for (int i = 0; i < tempIndexesLeft.size(); i++) {
507                         //add left if you havent already
508                         it = seen.find(tempIndexesLeft[i]);
509                         if (it == seen.end()) {  
510                                 mergedResults.push_back(tempIndexesLeft[i]);
511                                 seen[tempIndexesLeft[i]] = tempIndexesLeft[i];
512                         }
513
514                         //add right if you havent already
515                         it = seen.find(tempIndexesRight[i]);
516                         if (it == seen.end()) {  
517                                 mergedResults.push_back(tempIndexesRight[i]);
518                                 seen[tempIndexesRight[i]] = tempIndexesRight[i];
519                         }
520                 }
521                 
522                 
523                 vector<Sequence*> refResults;
524                 for (int i = 0; i < numWanted; i++) {
525                         Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
526                         refResults.push_back(temp);
527                 }
528                 
529                 delete queryRight;
530                 delete queryLeft;
531                 delete database;
532                 
533                 return refResults;
534         }
535         catch(exception& e) {
536                 errorOut(e, "Maligner", "getBlastSeqs");
537                 exit(1);
538         }
539 }
540
541 //***************************************************************************************************************
542