]> git.donarmstrong.com Git - mothur.git/blob - maligner.cpp
working on chimeras
[mothur.git] / maligner.cpp
1 /*
2  *  maligner.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/23/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "maligner.h"
11 #include "database.hpp"
12 #include "blastdb.hpp"
13
14 /***********************************************************************/
15 Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode) :
16                 db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode) {}
17 /***********************************************************************/
18 string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
19         try {
20                 
21                 outputResults.clear();
22                 
23                 //make copy so trimming doesn't destroy query from calling class - remember to deallocate
24                 query = new Sequence(q->getName(), q->getAligned());
25                 
26                 string chimera;
27                 
28                 if (searchMethod != "blast") {
29                         //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
30                         refSeqs = decalc->findClosest(query, db, numWanted);
31                 }else{
32                         refSeqs = getBlastSeqs(query, numWanted);
33                 }
34                 
35 //ofstream out;
36 //string name = toString(numi+1); 
37 //cout << name << endl;
38 //name = name.substr(name.find_first_of("|")+1);
39 //cout << name << endl;
40 //name = name.substr(name.find_first_of("|")+1);
41 //cout << name << endl;
42 //name = name.substr(0, name.find_last_of("|"));
43 //cout << name << endl;
44 //string filename = name + ".seqsformaligner";
45 //openOutputFile(filename, out);        
46 //for (int u = 0; u < refSeqs.size(); u++) {  refSeqs[u]->printSequence(out);   }
47 //out.close();
48 //filename = name + ".fasta";
49 //openOutputFile(filename, out);        
50 //query->printSequence(out);
51 //out.close();
52
53 //for (int i = 0; i < refSeqs.size(); i++)  { cout << refSeqs[i]->getName() << endl;  } 
54 //cout << "before = " << refSeqs.size() << endl;
55                 refSeqs = minCoverageFilter(refSeqs);
56 //cout << "after = " << refSeqs.size() << endl;         
57                 if (refSeqs.size() < 2)  { 
58                         for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];  }
59                         percentIdenticalQueryChimera = 0.0;
60                         return "unknown"; 
61                 }
62                 
63                 int chimeraPenalty = computeChimeraPenalty();
64 //cout << chimeraPenalty << endl;               
65                 //fills outputResults
66                 chimera = chimeraMaligner(chimeraPenalty, decalc);
67                 
68                                 
69                 //free memory
70                 delete query;
71                 for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];  }
72                 
73                 return chimera;
74         }
75         catch(exception& e) {
76                 errorOut(e, "Maligner", "getResults");
77                 exit(1);
78         }
79 }
80 /***********************************************************************/
81 string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
82         try {
83                 
84                 string chimera;
85                 
86                 //trims seqs to first non gap char in all seqs and last non gap char in all seqs
87                 spotMap = decalc->trimSeqs(query, refSeqs);
88                 
89                 vector<Sequence*> temp = refSeqs;
90                 temp.push_back(query);
91                 
92                 verticalFilter(temp);
93
94                 vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
95                 
96                 fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
97                 
98                 vector<score_struct> path = extractHighestPath(matrix);
99                 
100                 vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
101                 
102                 if (trace.size() > 1) {         chimera = "yes";        }
103                 else { chimera = "no";  }
104                 
105                 int traceStart = path[0].col;
106                 int traceEnd = path[path.size()-1].col;
107                 
108                 string queryInRange = query->getAligned();
109                 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
110         
111                 string chimeraSeq = constructChimericSeq(trace, refSeqs);
112                 
113                 percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
114                 
115                 //save output results
116                 for (int i = 0; i < trace.size(); i++) {
117                         int regionStart = trace[i].col;
118                         int regionEnd = trace[i].oldCol;
119                         int seqIndex = trace[i].row;
120                         
121                         results temp;
122                         
123                         temp.parent = refSeqs[seqIndex]->getName();
124                         temp.nastRegionStart = spotMap[regionStart];
125                         temp.nastRegionEnd = spotMap[regionEnd];
126                         temp.regionStart = regionStart;
127                         temp.regionEnd = regionEnd;
128                         
129                         string parentInRange = refSeqs[seqIndex]->getAligned();
130                         parentInRange = parentInRange.substr(traceStart, (traceEnd-traceStart+1));
131                         
132                         temp.queryToParent = computePercentID(queryInRange, parentInRange);
133                         temp.divR = (percentIdenticalQueryChimera / temp.queryToParent);
134                         
135                         string queryInRegion = query->getAligned();
136                         queryInRegion = queryInRegion.substr(regionStart, (regionEnd-regionStart+1));
137                         
138                         string parentInRegion = refSeqs[seqIndex]->getAligned();
139                         parentInRegion = parentInRegion.substr(regionStart, (regionEnd-regionStart+1));
140                         
141                         temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
142                 
143                         outputResults.push_back(temp);
144                 }
145
146                 return chimera;
147         }
148         catch(exception& e) {
149                 errorOut(e, "Maligner", "chimeraMaligner");
150                 exit(1);
151         }
152 }
153 /***********************************************************************/
154 //removes top matches that do not have minimum coverage with query.
155 vector<Sequence*> Maligner::minCoverageFilter(vector<Sequence*> ref){  
156         try {
157                 vector<Sequence*> newRefs;
158                 
159                 string queryAligned = query->getAligned();
160                 
161                 for (int i = 0; i < ref.size(); i++) {
162                         
163                         string refAligned = ref[i]->getAligned();
164                         
165                         int numBases = 0;
166                         int numCovered = 0;
167                         
168                         //calculate coverage
169                         for (int j = 0; j < queryAligned.length(); j++) {
170                                 
171                                 if (isalpha(queryAligned[j])) {
172                                         numBases++;
173                                         
174                                         if (isalpha(refAligned[j])) {
175                                                 numCovered++;
176                                         }
177                                 }
178                         }
179                         
180                         int coverage = ((numCovered/(float)numBases)*100);
181                         
182                         //if coverage above minimum
183                         if (coverage > minCoverage) {
184                                 newRefs.push_back(ref[i]);
185                         }
186                 }
187                 
188                 return newRefs;
189         }
190         catch(exception& e) {
191                 errorOut(e, "Maligner", "minCoverageFilter");
192                 exit(1);
193         }
194 }
195 /***********************************************************************/
196 // a breakpoint should yield fewer mismatches than this number with respect to the best parent sequence.
197 int Maligner::computeChimeraPenalty() {
198         try {
199                 
200                 int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases());
201         
202                 int penalty = int(numAllowable + 1) * misMatchPenalty;
203                                                                                          
204                 return penalty;
205
206         }
207         catch(exception& e) {
208                 errorOut(e, "Maligner", "computeChimeraPenalty");
209                 exit(1);
210         }
211 }
212 /***********************************************************************/
213 //this is a vertical filter
214 void Maligner::verticalFilter(vector<Sequence*> seqs) {
215         try {
216                 vector<int> gaps;       gaps.resize(query->getAligned().length(), 0);
217                 
218                 string filterString = (string(query->getAligned().length(), '1'));
219                 
220                 //for each sequence
221                 for (int i = 0; i < seqs.size(); i++) {
222                 
223                         string seqAligned = seqs[i]->getAligned();
224                         
225                         for (int j = 0; j < seqAligned.length(); j++) {
226                                 //if this spot is a gap
227                                 if ((seqAligned[j] == '-') || (seqAligned[j] == '.'))   {       gaps[j]++;      }
228                         }
229                 }
230                 
231                 //zero out spot where all sequences have blanks
232                 int numColRemoved = 0;
233                 for(int i = 0; i < seqs[0]->getAligned().length(); i++){
234                         if(gaps[i] == seqs.size())      {       filterString[i] = '0';  numColRemoved++;  }
235                 }
236                 
237                 map<int, int> newMap;
238                 //for each sequence
239                 for (int i = 0; i < seqs.size(); i++) {
240                 
241                         string seqAligned = seqs[i]->getAligned();
242                         string newAligned = "";
243                         int count = 0;
244                         
245                         for (int j = 0; j < seqAligned.length(); j++) {
246                                 //if this spot is not a gap
247                                 if (filterString[j] == '1') { 
248                                         newAligned += seqAligned[j]; 
249                                         newMap[count] = spotMap[j];
250                                         count++;
251                                 }
252                         }
253                         
254                         seqs[i]->setAligned(newAligned);
255                 }
256
257                 spotMap = newMap;
258         }
259         catch(exception& e) {
260                 errorOut(e, "Maligner", "verticalFilter");
261                 exit(1);
262         }
263 }
264 //***************************************************************************************************************
265 vector< vector<score_struct> > Maligner::buildScoreMatrix(int cols, int rows) {
266         try{
267                 
268                 vector< vector<score_struct> > m; m.resize(rows);
269                 
270                 for (int i = 0; i < m.size(); i++) {
271                         for (int j = 0; j < cols; j++) {
272                                 
273                                 //initialize each cell
274                                 score_struct temp;
275                                 temp.prev = -1;
276                                 temp.score = -9999999;
277                                 temp.col = j;
278                                 temp.row = i;
279                                 
280                                 m[i].push_back(temp);
281                         }
282                 }
283                 
284                 return m;
285         }
286         catch(exception& e) {
287                 errorOut(e, "Maligner", "buildScoreMatrix");
288                 exit(1);
289         }
290 }
291 //***************************************************************************************************************
292 void Maligner::fillScoreMatrix(vector<vector<score_struct> >& m, vector<Sequence*> seqs, int penalty) {
293         try{
294                 
295                 //get matrix dimensions
296                 int numCols = query->getAligned().length();
297                 int numRows = seqs.size();
298                 
299                 //initialize first col
300                 string queryAligned = query->getAligned();
301                 for (int i = 0; i < numRows; i++) {
302                         string subjectAligned = seqs[i]->getAligned();
303                         
304                         //are you both gaps?
305                         if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
306                                 m[i][0].score = 0;
307                         }else if (queryAligned[0] == subjectAligned[0]) {
308                                 m[i][0].score = matchScore;
309                         }else{
310                                 m[i][0].score = 0;
311                         }
312                 }
313                 
314                 //fill rest of matrix
315                 for (int j = 1; j < numCols; j++) {  //iterate through matrix columns
316                 
317                         for (int i = 0; i < numRows; i++) {  //iterate through matrix rows
318                                 
319                                 string subjectAligned = seqs[i]->getAligned();
320                                 
321                                 int matchMisMatchScore = 0;
322                                 //are you both gaps?
323                                 if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
324                                         //leave the same
325                                 }else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
326                                         //leave the same
327                                 }else if (queryAligned[j] == subjectAligned[j]) {
328                                         matchMisMatchScore = matchScore;
329                                 }else if (queryAligned[j] != subjectAligned[j]) {
330                                         matchMisMatchScore = misMatchPenalty;
331                                 }
332                                 
333                                 //compute score based on previous columns scores
334                                 for (int prevIndex = 0; prevIndex < numRows; prevIndex++) { //iterate through rows
335                                         
336                                         int sumScore = matchMisMatchScore + m[prevIndex][j-1].score;
337                                         
338                                         //you are not at yourself
339                                         if (prevIndex != i) {   sumScore += penalty;    }
340                                         if (sumScore < 0)       {       sumScore = 0;                   }
341                                         
342                                         if (sumScore > m[i][j].score) {
343                                                 m[i][j].score = sumScore;
344                                                 m[i][j].prev = prevIndex;
345                                         }
346                                 }
347                         }
348                 }
349                 
350         }
351         catch(exception& e) {
352                 errorOut(e, "Maligner", "fillScoreMatrix");
353                 exit(1);
354         }
355 }
356 //***************************************************************************************************************
357 vector<score_struct> Maligner::extractHighestPath(vector<vector<score_struct> > m) {
358         try {
359         
360                 //get matrix dimensions
361                 int numCols = query->getAligned().length();
362                 int numRows = m.size();
363         
364         
365                 //find highest score scoring matrix
366                 score_struct highestStruct;
367                 int highestScore = 0;
368                 
369                 for (int i = 0; i < numRows; i++) {
370                         for (int j = 0; j < numCols; j++) {
371                                 if (m[i][j].score > highestScore) {
372                                         highestScore = m[i][j].score;
373                                         highestStruct = m[i][j];
374                                 }
375                         }
376                 }
377                                 
378                 vector<score_struct> path;
379                 
380                 int rowIndex = highestStruct.row;
381                 int pos = highestStruct.col;
382                 int score = highestStruct.score;
383                 
384                 while (pos >= 0 && score > 0) {
385                         score_struct temp = m[rowIndex][pos];
386                         score = temp.score;
387                         
388                         if (score > 0) {        path.push_back(temp);   }
389                         
390                         rowIndex = temp.prev;
391                         pos--;
392                 }
393
394                 reverse(path.begin(), path.end());
395         
396                 return path;
397                 
398         }
399         catch(exception& e) {
400                 errorOut(e, "Maligner", "extractHighestPath");
401                 exit(1);
402         }
403 }
404 //***************************************************************************************************************
405 vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path, vector<Sequence*> seqs) {
406         try {
407                 vector<trace_struct> trace;
408                 
409                 int region_index = path[0].row;
410                 int region_start = path[0].col;
411         
412                 for (int i = 1; i < path.size(); i++) {
413                 
414                         int next_region_index = path[i].row;
415                         
416                         if (next_region_index != region_index) {
417                                 
418                                 // add trace region
419                                 int col_index = path[i].col;
420                                 trace_struct temp;
421                                 temp.col = region_start;
422                                 temp.oldCol = col_index-1;
423                                 temp.row = region_index;
424                                 
425                                 trace.push_back(temp);
426                                                         
427                                 region_index = path[i].row;
428                                 region_start = col_index;
429                         }
430                 }
431         
432                 // get last one
433                 trace_struct temp;
434                 temp.col = region_start;
435                 temp.oldCol = path[path.size()-1].col;
436                 temp.row = region_index;
437                 trace.push_back(temp);
438
439                 return trace;
440                 
441         }
442         catch(exception& e) {
443                 errorOut(e, "Maligner", "mapTraceRegionsToAlignment");
444                 exit(1);
445         }
446 }
447 //***************************************************************************************************************
448 string Maligner::constructChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
449         try {
450                 string chimera = "";
451                 
452                 for (int i = 0; i < trace.size(); i++) {
453                         string seqAlign = seqs[trace[i].row]->getAligned();
454                         seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
455                         chimera += seqAlign;
456                 }
457                         
458                 return chimera;
459         }
460         catch(exception& e) {
461                 errorOut(e, "Maligner", "constructChimericSeq");
462                 exit(1);
463         }
464 }
465 //***************************************************************************************************************
466 float Maligner::computePercentID(string queryAlign, string chimera) {
467         try {
468         
469                 if (queryAlign.length() != chimera.length()) {
470                         mothurOut("Error, alignment strings are of different lengths: "); mothurOutEndLine();
471                         mothurOut(toString(queryAlign.length())); mothurOutEndLine(); mothurOutEndLine(); mothurOutEndLine(); mothurOutEndLine();
472                         mothurOut(toString(chimera.length())); mothurOutEndLine();
473                         return -1.0;
474                 }
475
476         
477                 int numBases = 0;
478                 int numIdentical = 0;
479         
480                 for (int i = 0; i < queryAlign.length(); i++) {
481                         if ((isalpha(queryAlign[i])) || (isalpha(chimera[i])))  {
482                                 numBases++;             
483                                 if (queryAlign[i] == chimera[i]) {
484                                         numIdentical++;
485                                 }
486                         }
487                 }
488         
489                 if (numBases == 0) { return 0; }
490         
491                 float percentIdentical = (numIdentical/(float)numBases) * 100;
492
493                 return percentIdentical;
494                 
495         }
496         catch(exception& e) {
497                 errorOut(e, "Maligner", "computePercentID");
498                 exit(1);
499         }
500 }
501 //***************************************************************************************************************
502 vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
503         try {
504 cout << q->getName() << endl;   
505                 //generate blastdb
506                 Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty);
507                 for (int i = 0; i < db.size(); i++) {   database->addSequence(*db[i]);  }
508                 database->generateDB();
509                 database->setNumSeqs(db.size());
510                 
511                 //get parts of query
512                 string queryAligned = q->getAligned();
513                 string leftQuery = queryAligned.substr(0, (queryAligned.length() / 3)); //first 1/3 of the sequence
514                 string rightQuery = queryAligned.substr(((queryAligned.length() / 3)*2)); //last 1/3 of the sequence
515                 
516                 Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
517                 Sequence* queryRight = new Sequence(q->getName(), rightQuery);
518                 
519                 map<int, float> tempIndexesRight = database->findClosest(queryRight, num);
520                 map<int, float> tempIndexesLeft = database->findClosest(queryLeft, num);
521                 
522                 //merge results
523                 vector<rank> mergedResults;
524                 
525                 map<int, float>::iterator it;
526                 map<int, float>::iterator it2;
527                 
528                 //add in right guys merging common finds
529                 for (it = tempIndexesRight.begin(); it != tempIndexesRight.end(); it++) {
530                         it2 = tempIndexesLeft.find(it->first);
531                         
532                         if (it2 == tempIndexesLeft.end()) {  //result only present in right
533                                 rank temp(it->first, it->second);
534                                 mergedResults.push_back(temp); 
535
536                         }else { //result present in both save best score
537                                 float bestscore;
538                                 if (it->second > it2->second)   {  bestscore = it->second;  }
539                                 else                                                    {  bestscore = it2->second; }
540                                 
541                                 rank temp(it->first, bestscore);
542                                 mergedResults.push_back(temp);
543                                 
544                                 tempIndexesLeft.erase(it2);
545                         }
546                 }
547                 
548                 //add in unique left guys
549                 for (it = tempIndexesLeft.begin(); it != tempIndexesLeft.end(); it++) {
550                         rank temp(it->first, it->second);
551                         mergedResults.push_back(temp); 
552                 }
553                 
554                 sort(mergedResults.begin(), mergedResults.end(), compareMembers);
555                 
556                 vector<Sequence*> refResults;
557                 for (int i = 0; i < numWanted; i++) {
558                         Sequence* temp = new Sequence(db[mergedResults[i].num]->getName(), db[mergedResults[i].num]->getAligned());
559                         refResults.push_back(temp);
560 cout << db[mergedResults[i].num]->getName() << endl;
561                 }
562                 
563                 delete queryRight;
564                 delete queryLeft;
565                 delete database;
566                 
567                 return refResults;
568         }
569         catch(exception& e) {
570                 errorOut(e, "Maligner", "getBlastSeqs");
571                 exit(1);
572         }
573 }
574
575 //***************************************************************************************************************
576