5 * Created by westcott on 9/23/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
11 #include "database.hpp"
12 #include "blastdb.hpp"
14 /***********************************************************************/
15 Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode) :
16 db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode) {}
17 /***********************************************************************/
18 string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
21 outputResults.clear();
23 //make copy so trimming doesn't destroy query from calling class - remember to deallocate
24 query = new Sequence(q->getName(), q->getAligned());
28 if (searchMethod != "blast") {
29 //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
30 refSeqs = decalc->findClosest(query, db, numWanted);
32 refSeqs = getBlastSeqs(query, numWanted);
35 refSeqs = minCoverageFilter(refSeqs);
37 if (refSeqs.size() < 2) {
38 for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
39 percentIdenticalQueryChimera = 0.0;
43 int chimeraPenalty = computeChimeraPenalty();
46 chimera = chimeraMaligner(chimeraPenalty, decalc);
51 for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
56 errorOut(e, "Maligner", "getResults");
60 /***********************************************************************/
61 string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
66 //trims seqs to first non gap char in all seqs and last non gap char in all seqs
67 spotMap = decalc->trimSeqs(query, refSeqs);
69 vector<Sequence*> temp = refSeqs;
70 temp.push_back(query);
74 vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
76 fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
78 vector<score_struct> path = extractHighestPath(matrix);
80 vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
82 if (trace.size() > 1) { chimera = "yes"; }
83 else { chimera = "no"; }
85 int traceStart = path[0].col;
86 int traceEnd = path[path.size()-1].col;
88 string queryInRange = query->getAligned();
89 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
91 string chimeraSeq = constructChimericSeq(trace, refSeqs);
93 percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
96 for (int i = 0; i < trace.size(); i++) {
97 int regionStart = trace[i].col;
98 int regionEnd = trace[i].oldCol;
99 int seqIndex = trace[i].row;
103 temp.parent = refSeqs[seqIndex]->getName();
104 temp.nastRegionStart = spotMap[regionStart];
105 temp.nastRegionEnd = spotMap[regionEnd];
106 temp.regionStart = regionStart;
107 temp.regionEnd = regionEnd;
109 string parentInRange = refSeqs[seqIndex]->getAligned();
110 parentInRange = parentInRange.substr(traceStart, (traceEnd-traceStart+1));
112 temp.queryToParent = computePercentID(queryInRange, parentInRange);
113 temp.divR = (percentIdenticalQueryChimera / temp.queryToParent);
115 string queryInRegion = query->getAligned();
116 queryInRegion = queryInRegion.substr(regionStart, (regionEnd-regionStart+1));
118 string parentInRegion = refSeqs[seqIndex]->getAligned();
119 parentInRegion = parentInRegion.substr(regionStart, (regionEnd-regionStart+1));
121 temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
123 outputResults.push_back(temp);
128 catch(exception& e) {
129 errorOut(e, "Maligner", "chimeraMaligner");
133 /***********************************************************************/
134 //removes top matches that do not have minimum coverage with query.
135 vector<Sequence*> Maligner::minCoverageFilter(vector<Sequence*> ref){
137 vector<Sequence*> newRefs;
139 string queryAligned = query->getAligned();
141 for (int i = 0; i < ref.size(); i++) {
143 string refAligned = ref[i]->getAligned();
149 for (int j = 0; j < queryAligned.length(); j++) {
151 if (isalpha(queryAligned[j])) {
154 if (isalpha(refAligned[j])) {
160 int coverage = ((numCovered/(float)numBases)*100);
162 //if coverage above minimum
163 if (coverage > minCoverage) {
164 newRefs.push_back(ref[i]);
170 catch(exception& e) {
171 errorOut(e, "Maligner", "minCoverageFilter");
175 /***********************************************************************/
176 // a breakpoint should yield fewer mismatches than this number with respect to the best parent sequence.
177 int Maligner::computeChimeraPenalty() {
180 int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases());
182 int penalty = int(numAllowable + 1) * misMatchPenalty;
187 catch(exception& e) {
188 errorOut(e, "Maligner", "computeChimeraPenalty");
192 /***********************************************************************/
193 //this is a vertical filter
194 void Maligner::verticalFilter(vector<Sequence*> seqs) {
196 vector<int> gaps; gaps.resize(query->getAligned().length(), 0);
198 string filterString = (string(query->getAligned().length(), '1'));
201 for (int i = 0; i < seqs.size(); i++) {
203 string seqAligned = seqs[i]->getAligned();
205 for (int j = 0; j < seqAligned.length(); j++) {
206 //if this spot is a gap
207 if ((seqAligned[j] == '-') || (seqAligned[j] == '.')) { gaps[j]++; }
211 //zero out spot where all sequences have blanks
212 int numColRemoved = 0;
213 for(int i = 0; i < seqs[0]->getAligned().length(); i++){
214 if(gaps[i] == seqs.size()) { filterString[i] = '0'; numColRemoved++; }
217 map<int, int> newMap;
219 for (int i = 0; i < seqs.size(); i++) {
221 string seqAligned = seqs[i]->getAligned();
222 string newAligned = "";
225 for (int j = 0; j < seqAligned.length(); j++) {
226 //if this spot is not a gap
227 if (filterString[j] == '1') {
228 newAligned += seqAligned[j];
229 newMap[count] = spotMap[j];
234 seqs[i]->setAligned(newAligned);
239 catch(exception& e) {
240 errorOut(e, "Maligner", "verticalFilter");
244 //***************************************************************************************************************
245 vector< vector<score_struct> > Maligner::buildScoreMatrix(int cols, int rows) {
248 vector< vector<score_struct> > m; m.resize(rows);
250 for (int i = 0; i < m.size(); i++) {
251 for (int j = 0; j < cols; j++) {
253 //initialize each cell
256 temp.score = -9999999;
260 m[i].push_back(temp);
266 catch(exception& e) {
267 errorOut(e, "Maligner", "buildScoreMatrix");
271 //***************************************************************************************************************
272 void Maligner::fillScoreMatrix(vector<vector<score_struct> >& m, vector<Sequence*> seqs, int penalty) {
275 //get matrix dimensions
276 int numCols = query->getAligned().length();
277 int numRows = seqs.size();
279 //initialize first col
280 string queryAligned = query->getAligned();
281 for (int i = 0; i < numRows; i++) {
282 string subjectAligned = seqs[i]->getAligned();
285 if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
287 }else if (queryAligned[0] == subjectAligned[0]) {
288 m[i][0].score = matchScore;
294 //fill rest of matrix
295 for (int j = 1; j < numCols; j++) { //iterate through matrix columns
297 for (int i = 0; i < numRows; i++) { //iterate through matrix rows
299 string subjectAligned = seqs[i]->getAligned();
301 int matchMisMatchScore = 0;
303 if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
305 }else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
307 }else if (queryAligned[j] == subjectAligned[j]) {
308 matchMisMatchScore = matchScore;
309 }else if (queryAligned[j] != subjectAligned[j]) {
310 matchMisMatchScore = misMatchPenalty;
313 //compute score based on previous columns scores
314 for (int prevIndex = 0; prevIndex < numRows; prevIndex++) { //iterate through rows
316 int sumScore = matchMisMatchScore + m[prevIndex][j-1].score;
318 //you are not at yourself
319 if (prevIndex != i) { sumScore += penalty; }
320 if (sumScore < 0) { sumScore = 0; }
322 if (sumScore > m[i][j].score) {
323 m[i][j].score = sumScore;
324 m[i][j].prev = prevIndex;
331 catch(exception& e) {
332 errorOut(e, "Maligner", "fillScoreMatrix");
336 //***************************************************************************************************************
337 vector<score_struct> Maligner::extractHighestPath(vector<vector<score_struct> > m) {
340 //get matrix dimensions
341 int numCols = query->getAligned().length();
342 int numRows = m.size();
345 //find highest score scoring matrix
346 score_struct highestStruct;
347 int highestScore = 0;
349 for (int i = 0; i < numRows; i++) {
350 for (int j = 0; j < numCols; j++) {
351 if (m[i][j].score > highestScore) {
352 highestScore = m[i][j].score;
353 highestStruct = m[i][j];
358 vector<score_struct> path;
360 int rowIndex = highestStruct.row;
361 int pos = highestStruct.col;
362 int score = highestStruct.score;
364 while (pos >= 0 && score > 0) {
365 score_struct temp = m[rowIndex][pos];
368 if (score > 0) { path.push_back(temp); }
370 rowIndex = temp.prev;
374 reverse(path.begin(), path.end());
379 catch(exception& e) {
380 errorOut(e, "Maligner", "extractHighestPath");
384 //***************************************************************************************************************
385 vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path, vector<Sequence*> seqs) {
387 vector<trace_struct> trace;
389 int region_index = path[0].row;
390 int region_start = path[0].col;
392 for (int i = 1; i < path.size(); i++) {
394 int next_region_index = path[i].row;
396 if (next_region_index != region_index) {
399 int col_index = path[i].col;
401 temp.col = region_start;
402 temp.oldCol = col_index-1;
403 temp.row = region_index;
405 trace.push_back(temp);
407 region_index = path[i].row;
408 region_start = col_index;
414 temp.col = region_start;
415 temp.oldCol = path[path.size()-1].col;
416 temp.row = region_index;
417 trace.push_back(temp);
422 catch(exception& e) {
423 errorOut(e, "Maligner", "mapTraceRegionsToAlignment");
427 //***************************************************************************************************************
428 string Maligner::constructChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
432 for (int i = 0; i < trace.size(); i++) {
433 string seqAlign = seqs[trace[i].row]->getAligned();
434 seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
440 catch(exception& e) {
441 errorOut(e, "Maligner", "constructChimericSeq");
445 //***************************************************************************************************************
446 float Maligner::computePercentID(string queryAlign, string chimera) {
449 if (queryAlign.length() != chimera.length()) {
450 mothurOut("Error, alignment strings are of different lengths: "); mothurOutEndLine();
451 mothurOut(toString(queryAlign.length())); mothurOutEndLine(); mothurOutEndLine(); mothurOutEndLine(); mothurOutEndLine();
452 mothurOut(toString(chimera.length())); mothurOutEndLine();
458 int numIdentical = 0;
460 for (int i = 0; i < queryAlign.length(); i++) {
461 if ((isalpha(queryAlign[i])) || (isalpha(chimera[i]))) {
463 if (queryAlign[i] == chimera[i]) {
469 if (numBases == 0) { return 0; }
471 float percentIdentical = (numIdentical/(float)numBases) * 100;
473 return percentIdentical;
476 catch(exception& e) {
477 errorOut(e, "Maligner", "computePercentID");
481 //***************************************************************************************************************
482 vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
485 Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty);
486 for (int i = 0; i < db.size(); i++) { database->addSequence(*db[i]); }
487 database->generateDB();
488 database->setNumSeqs(db.size());
491 string queryUnAligned = q->getUnaligned();
492 string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
493 string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
495 Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
496 Sequence* queryRight = new Sequence(q->getName(), rightQuery);
498 vector<int> tempIndexesRight = database->findClosestMegaBlast(queryRight, num+1);
499 vector<int> tempIndexesLeft = database->findClosestMegaBlast(queryLeft, num+1);
503 map<int, int>::iterator it;
505 vector<int> mergedResults;
506 for (int i = 0; i < tempIndexesLeft.size(); i++) {
507 //add left if you havent already
508 it = seen.find(tempIndexesLeft[i]);
509 if (it == seen.end()) {
510 mergedResults.push_back(tempIndexesLeft[i]);
511 seen[tempIndexesLeft[i]] = tempIndexesLeft[i];
514 //add right if you havent already
515 it = seen.find(tempIndexesRight[i]);
516 if (it == seen.end()) {
517 mergedResults.push_back(tempIndexesRight[i]);
518 seen[tempIndexesRight[i]] = tempIndexesRight[i];
523 vector<Sequence*> refResults;
524 for (int i = 0; i < numWanted; i++) {
525 Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
526 refResults.push_back(temp);
535 catch(exception& e) {
536 errorOut(e, "Maligner", "getBlastSeqs");
541 //***************************************************************************************************************