5 * Created by westcott on 9/23/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
11 #include "database.hpp"
12 #include "blastdb.hpp"
14 /***********************************************************************/
15 Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode) :
16 db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode) {}
17 /***********************************************************************/
18 string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
21 outputResults.clear();
23 //make copy so trimming doesn't destroy query from calling class - remember to deallocate
24 query = new Sequence(q->getName(), q->getAligned());
28 if (searchMethod != "blast") {
29 //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
30 refSeqs = decalc->findClosest(query, db, numWanted);
32 refSeqs = getBlastSeqs(query, numWanted);
36 //string name = toString(numi+1);
37 //cout << name << endl;
38 //name = name.substr(name.find_first_of("|")+1);
39 //cout << name << endl;
40 //name = name.substr(name.find_first_of("|")+1);
41 //cout << name << endl;
42 //name = name.substr(0, name.find_last_of("|"));
43 //cout << name << endl;
44 //string filename = name + ".seqsformaligner";
45 //openOutputFile(filename, out);
46 //for (int u = 0; u < refSeqs.size(); u++) { refSeqs[u]->printSequence(out); }
48 //filename = name + ".fasta";
49 //openOutputFile(filename, out);
50 //query->printSequence(out);
53 //for (int i = 0; i < refSeqs.size(); i++) { cout << refSeqs[i]->getName() << endl; }
54 //cout << "before = " << refSeqs.size() << endl;
55 refSeqs = minCoverageFilter(refSeqs);
56 //cout << "after = " << refSeqs.size() << endl;
57 if (refSeqs.size() < 2) {
58 for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
59 percentIdenticalQueryChimera = 0.0;
63 int chimeraPenalty = computeChimeraPenalty();
64 //cout << chimeraPenalty << endl;
66 chimera = chimeraMaligner(chimeraPenalty, decalc);
71 for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
76 errorOut(e, "Maligner", "getResults");
80 /***********************************************************************/
81 string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
86 //trims seqs to first non gap char in all seqs and last non gap char in all seqs
87 spotMap = decalc->trimSeqs(query, refSeqs);
89 vector<Sequence*> temp = refSeqs;
90 temp.push_back(query);
94 vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
96 fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
98 vector<score_struct> path = extractHighestPath(matrix);
100 vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
102 if (trace.size() > 1) { chimera = "yes"; }
103 else { chimera = "no"; }
105 int traceStart = path[0].col;
106 int traceEnd = path[path.size()-1].col;
108 string queryInRange = query->getAligned();
109 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
111 string chimeraSeq = constructChimericSeq(trace, refSeqs);
113 percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
115 //save output results
116 for (int i = 0; i < trace.size(); i++) {
117 int regionStart = trace[i].col;
118 int regionEnd = trace[i].oldCol;
119 int seqIndex = trace[i].row;
123 temp.parent = refSeqs[seqIndex]->getName();
124 temp.nastRegionStart = spotMap[regionStart];
125 temp.nastRegionEnd = spotMap[regionEnd];
126 temp.regionStart = regionStart;
127 temp.regionEnd = regionEnd;
129 string parentInRange = refSeqs[seqIndex]->getAligned();
130 parentInRange = parentInRange.substr(traceStart, (traceEnd-traceStart+1));
132 temp.queryToParent = computePercentID(queryInRange, parentInRange);
133 temp.divR = (percentIdenticalQueryChimera / temp.queryToParent);
135 string queryInRegion = query->getAligned();
136 queryInRegion = queryInRegion.substr(regionStart, (regionEnd-regionStart+1));
138 string parentInRegion = refSeqs[seqIndex]->getAligned();
139 parentInRegion = parentInRegion.substr(regionStart, (regionEnd-regionStart+1));
141 temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
143 outputResults.push_back(temp);
148 catch(exception& e) {
149 errorOut(e, "Maligner", "chimeraMaligner");
153 /***********************************************************************/
154 //removes top matches that do not have minimum coverage with query.
155 vector<Sequence*> Maligner::minCoverageFilter(vector<Sequence*> ref){
157 vector<Sequence*> newRefs;
159 string queryAligned = query->getAligned();
161 for (int i = 0; i < ref.size(); i++) {
163 string refAligned = ref[i]->getAligned();
169 for (int j = 0; j < queryAligned.length(); j++) {
171 if (isalpha(queryAligned[j])) {
174 if (isalpha(refAligned[j])) {
180 int coverage = ((numCovered/(float)numBases)*100);
182 //if coverage above minimum
183 if (coverage > minCoverage) {
184 newRefs.push_back(ref[i]);
190 catch(exception& e) {
191 errorOut(e, "Maligner", "minCoverageFilter");
195 /***********************************************************************/
196 // a breakpoint should yield fewer mismatches than this number with respect to the best parent sequence.
197 int Maligner::computeChimeraPenalty() {
200 int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases());
202 int penalty = int(numAllowable + 1) * misMatchPenalty;
207 catch(exception& e) {
208 errorOut(e, "Maligner", "computeChimeraPenalty");
212 /***********************************************************************/
213 //this is a vertical filter
214 void Maligner::verticalFilter(vector<Sequence*> seqs) {
216 vector<int> gaps; gaps.resize(query->getAligned().length(), 0);
218 string filterString = (string(query->getAligned().length(), '1'));
221 for (int i = 0; i < seqs.size(); i++) {
223 string seqAligned = seqs[i]->getAligned();
225 for (int j = 0; j < seqAligned.length(); j++) {
226 //if this spot is a gap
227 if ((seqAligned[j] == '-') || (seqAligned[j] == '.')) { gaps[j]++; }
231 //zero out spot where all sequences have blanks
232 int numColRemoved = 0;
233 for(int i = 0; i < seqs[0]->getAligned().length(); i++){
234 if(gaps[i] == seqs.size()) { filterString[i] = '0'; numColRemoved++; }
237 map<int, int> newMap;
239 for (int i = 0; i < seqs.size(); i++) {
241 string seqAligned = seqs[i]->getAligned();
242 string newAligned = "";
245 for (int j = 0; j < seqAligned.length(); j++) {
246 //if this spot is not a gap
247 if (filterString[j] == '1') {
248 newAligned += seqAligned[j];
249 newMap[count] = spotMap[j];
254 seqs[i]->setAligned(newAligned);
259 catch(exception& e) {
260 errorOut(e, "Maligner", "verticalFilter");
264 //***************************************************************************************************************
265 vector< vector<score_struct> > Maligner::buildScoreMatrix(int cols, int rows) {
268 vector< vector<score_struct> > m; m.resize(rows);
270 for (int i = 0; i < m.size(); i++) {
271 for (int j = 0; j < cols; j++) {
273 //initialize each cell
276 temp.score = -9999999;
280 m[i].push_back(temp);
286 catch(exception& e) {
287 errorOut(e, "Maligner", "buildScoreMatrix");
291 //***************************************************************************************************************
292 void Maligner::fillScoreMatrix(vector<vector<score_struct> >& m, vector<Sequence*> seqs, int penalty) {
295 //get matrix dimensions
296 int numCols = query->getAligned().length();
297 int numRows = seqs.size();
299 //initialize first col
300 string queryAligned = query->getAligned();
301 for (int i = 0; i < numRows; i++) {
302 string subjectAligned = seqs[i]->getAligned();
305 if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
307 }else if (queryAligned[0] == subjectAligned[0]) {
308 m[i][0].score = matchScore;
314 //fill rest of matrix
315 for (int j = 1; j < numCols; j++) { //iterate through matrix columns
317 for (int i = 0; i < numRows; i++) { //iterate through matrix rows
319 string subjectAligned = seqs[i]->getAligned();
321 int matchMisMatchScore = 0;
323 if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
325 }else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
327 }else if (queryAligned[j] == subjectAligned[j]) {
328 matchMisMatchScore = matchScore;
329 }else if (queryAligned[j] != subjectAligned[j]) {
330 matchMisMatchScore = misMatchPenalty;
333 //compute score based on previous columns scores
334 for (int prevIndex = 0; prevIndex < numRows; prevIndex++) { //iterate through rows
336 int sumScore = matchMisMatchScore + m[prevIndex][j-1].score;
338 //you are not at yourself
339 if (prevIndex != i) { sumScore += penalty; }
340 if (sumScore < 0) { sumScore = 0; }
342 if (sumScore > m[i][j].score) {
343 m[i][j].score = sumScore;
344 m[i][j].prev = prevIndex;
351 catch(exception& e) {
352 errorOut(e, "Maligner", "fillScoreMatrix");
356 //***************************************************************************************************************
357 vector<score_struct> Maligner::extractHighestPath(vector<vector<score_struct> > m) {
360 //get matrix dimensions
361 int numCols = query->getAligned().length();
362 int numRows = m.size();
365 //find highest score scoring matrix
366 score_struct highestStruct;
367 int highestScore = 0;
369 for (int i = 0; i < numRows; i++) {
370 for (int j = 0; j < numCols; j++) {
371 if (m[i][j].score > highestScore) {
372 highestScore = m[i][j].score;
373 highestStruct = m[i][j];
378 vector<score_struct> path;
380 int rowIndex = highestStruct.row;
381 int pos = highestStruct.col;
382 int score = highestStruct.score;
384 while (pos >= 0 && score > 0) {
385 score_struct temp = m[rowIndex][pos];
388 if (score > 0) { path.push_back(temp); }
390 rowIndex = temp.prev;
394 reverse(path.begin(), path.end());
399 catch(exception& e) {
400 errorOut(e, "Maligner", "extractHighestPath");
404 //***************************************************************************************************************
405 vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path, vector<Sequence*> seqs) {
407 vector<trace_struct> trace;
409 int region_index = path[0].row;
410 int region_start = path[0].col;
412 for (int i = 1; i < path.size(); i++) {
414 int next_region_index = path[i].row;
416 if (next_region_index != region_index) {
419 int col_index = path[i].col;
421 temp.col = region_start;
422 temp.oldCol = col_index-1;
423 temp.row = region_index;
425 trace.push_back(temp);
427 region_index = path[i].row;
428 region_start = col_index;
434 temp.col = region_start;
435 temp.oldCol = path[path.size()-1].col;
436 temp.row = region_index;
437 trace.push_back(temp);
442 catch(exception& e) {
443 errorOut(e, "Maligner", "mapTraceRegionsToAlignment");
447 //***************************************************************************************************************
448 string Maligner::constructChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
452 for (int i = 0; i < trace.size(); i++) {
453 string seqAlign = seqs[trace[i].row]->getAligned();
454 seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
460 catch(exception& e) {
461 errorOut(e, "Maligner", "constructChimericSeq");
465 //***************************************************************************************************************
466 float Maligner::computePercentID(string queryAlign, string chimera) {
469 if (queryAlign.length() != chimera.length()) {
470 mothurOut("Error, alignment strings are of different lengths: "); mothurOutEndLine();
471 mothurOut(toString(queryAlign.length())); mothurOutEndLine(); mothurOutEndLine(); mothurOutEndLine(); mothurOutEndLine();
472 mothurOut(toString(chimera.length())); mothurOutEndLine();
478 int numIdentical = 0;
480 for (int i = 0; i < queryAlign.length(); i++) {
481 if ((isalpha(queryAlign[i])) || (isalpha(chimera[i]))) {
483 if (queryAlign[i] == chimera[i]) {
489 if (numBases == 0) { return 0; }
491 float percentIdentical = (numIdentical/(float)numBases) * 100;
493 return percentIdentical;
496 catch(exception& e) {
497 errorOut(e, "Maligner", "computePercentID");
501 //***************************************************************************************************************
502 vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
504 cout << q->getName() << endl;
506 Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty);
507 for (int i = 0; i < db.size(); i++) { database->addSequence(*db[i]); }
508 database->generateDB();
509 database->setNumSeqs(db.size());
512 string queryAligned = q->getAligned();
513 string leftQuery = queryAligned.substr(0, (queryAligned.length() / 3)); //first 1/3 of the sequence
514 string rightQuery = queryAligned.substr(((queryAligned.length() / 3)*2)); //last 1/3 of the sequence
516 Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
517 Sequence* queryRight = new Sequence(q->getName(), rightQuery);
519 map<int, float> tempIndexesRight = database->findClosest(queryRight, num);
520 map<int, float> tempIndexesLeft = database->findClosest(queryLeft, num);
523 vector<rank> mergedResults;
525 map<int, float>::iterator it;
526 map<int, float>::iterator it2;
528 //add in right guys merging common finds
529 for (it = tempIndexesRight.begin(); it != tempIndexesRight.end(); it++) {
530 it2 = tempIndexesLeft.find(it->first);
532 if (it2 == tempIndexesLeft.end()) { //result only present in right
533 rank temp(it->first, it->second);
534 mergedResults.push_back(temp);
536 }else { //result present in both save best score
538 if (it->second > it2->second) { bestscore = it->second; }
539 else { bestscore = it2->second; }
541 rank temp(it->first, bestscore);
542 mergedResults.push_back(temp);
544 tempIndexesLeft.erase(it2);
548 //add in unique left guys
549 for (it = tempIndexesLeft.begin(); it != tempIndexesLeft.end(); it++) {
550 rank temp(it->first, it->second);
551 mergedResults.push_back(temp);
554 sort(mergedResults.begin(), mergedResults.end(), compareMembers);
556 vector<Sequence*> refResults;
557 for (int i = 0; i < numWanted; i++) {
558 Sequence* temp = new Sequence(db[mergedResults[i].num]->getName(), db[mergedResults[i].num]->getAligned());
559 refResults.push_back(temp);
560 cout << db[mergedResults[i].num]->getName() << endl;
569 catch(exception& e) {
570 errorOut(e, "Maligner", "getBlastSeqs");
575 //***************************************************************************************************************