From: westcott Date: Tue, 16 Feb 2010 15:23:05 +0000 (+0000) Subject: chimeracode X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=9ada98592a54c82d08f3d46c9b1d8c3e472a922d chimeracode --- diff --git a/blastdb.cpp b/blastdb.cpp index 50a0401..b139f3e 100644 --- a/blastdb.cpp +++ b/blastdb.cpp @@ -50,7 +50,7 @@ vector BlastDB::findClosestSequences(Sequence* seq, int n) { // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too // long. With this setting, it seems comparable in speed to the suffix tree approach. - string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28 -v " + toString(n); + string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);; blastCommand += (" -i " + queryFileName + " -o " + blastFileName); system(blastCommand.c_str()); @@ -82,9 +82,9 @@ vector BlastDB::findClosestSequences(Sequence* seq, int n) { } /**************************************************************************************************/ //assumes you have added all the template sequences using the addSequence function and run generateDB. -map BlastDB::findClosest(Sequence* seq, int n) { +vector BlastDB::findClosestMegaBlast(Sequence* seq, int n) { try{ - map topMatches; + vector topMatches; ofstream queryFile; openOutputFile(queryFileName, queryFile); @@ -96,7 +96,7 @@ map BlastDB::findClosest(Sequence* seq, int n) { // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too // long. With this setting, it seems comparable in speed to the suffix tree approach. - string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -b " + toString(n) + " -v " + toString(n); + string blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn blastCommand += (" -i " + queryFileName + " -o " + blastFileName); system(blastCommand.c_str()); @@ -106,22 +106,18 @@ map BlastDB::findClosest(Sequence* seq, int n) { string dummy; int templateAccession; gobble(m8FileHandle); -//string name = seq->getName(); -//ofstream out; -//openOutputFileAppend(name, out); + while(!m8FileHandle.eof()){ m8FileHandle >> dummy >> templateAccession >> searchScore; -//out << dummy << '\t' << templateAccession << '\t' << searchScore << endl; + //get rest of junk in line - while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); - //out << c; - if (c == 10 || c == 13){ break; } } + while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; } } gobble(m8FileHandle); - topMatches[templateAccession] = searchScore; + topMatches.push_back(templateAccession); } m8FileHandle.close(); -//out.close(); + return topMatches; } catch(exception& e) { diff --git a/blastdb.hpp b/blastdb.hpp index fea55c8..0f8fccd 100644 --- a/blastdb.hpp +++ b/blastdb.hpp @@ -23,7 +23,7 @@ public: void generateDB(); void addSequence(Sequence); vector findClosestSequences(Sequence*, int); - map findClosest(Sequence*, int); //template index -> searchscore + vector findClosestMegaBlast(Sequence*, int); private: string dbFileName; diff --git a/chimera.h b/chimera.h index a968a02..4d59012 100644 --- a/chimera.h +++ b/chimera.h @@ -50,23 +50,11 @@ struct results { float divR; }; /***********************************************************************/ -struct rank { - int num; - float score; - rank(int n, float s) : num(n), score(s) {} -}; -/***********************************************************************/ - struct SeqDist { Sequence* seq; float dist; }; //******************************************************************************************************************** -//sorts highest to lowest -inline bool compareMembers(rank left, rank right){ - return (left.score > right.score); -} -//******************************************************************************************************************** //sorts lowest to highest inline bool compareRegionStart(results left, results right){ return (left.nastRegionStart < right.nastRegionStart); diff --git a/chimerarealigner.cpp b/chimerarealigner.cpp index 57c3641..f45b9a8 100644 --- a/chimerarealigner.cpp +++ b/chimerarealigner.cpp @@ -29,19 +29,23 @@ void ChimeraReAligner::reAlign(Sequence* query, vector parents) { sort(parents.begin(), parents.end(), compareRegionStart); //make sure you don't cutoff beginning of query - if (parents[0].nastRegionStart > 0) { newQuery += qAligned.substr(0, parents[0].nastRegionStart+1); } + if (parents[0].nastRegionStart > 0) { newQuery += qAligned.substr(0, parents[0].nastRegionStart); } int longest = 0; //take query and break apart into pieces using breakpoints given by results of parents for (int i = 0; i < parents.size(); i++) { - int length = parents[i].nastRegionEnd - parents[i].nastRegionStart+1; + cout << parents[i].parent << '\t' << parents[i].nastRegionStart << '\t' << parents[i].nastRegionEnd << endl; + int length = parents[i].nastRegionEnd - parents[i].nastRegionStart; string q = qAligned.substr(parents[i].nastRegionStart, length); + cout << "query = " << q << endl; Sequence* queryFrag = new Sequence(query->getName(), q); queryParts.push_back(queryFrag); Sequence* parent = getSequence(parents[i].parent); string p = parent->getAligned(); + p = p.substr(parents[i].nastRegionStart, length); + cout << "parent = " << p << endl; parent->setAligned(p); parentParts.push_back(parent); @@ -62,8 +66,9 @@ void ChimeraReAligner::reAlign(Sequence* query, vector parents) { newQuery += queryParts[i]->getAligned(); } + //make sure you don't cutoff end of query - if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) { newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd-1); } + if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) { newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd); } //set query to new aligned string query->setAligned(newQuery); diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index b2c0a05..2cbd5d8 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -64,9 +64,9 @@ int ChimeraSlayer::getChimeras(Sequence* query) { vector Results = maligner->getOutput(); //realign query to parents to improve slayers detection rate - //ChimeraReAligner realigner(templateSeqs, match, misMatch); - //realigner.reAlign(query, Results); - + ChimeraReAligner realigner(templateSeqs, match, misMatch); + realigner.reAlign(query, Results); +cout << query->getName() << '\n' << query->getAligned() << endl; //if (chimeraFlag == "yes") { //get sequence that were given from maligner results diff --git a/database.hpp b/database.hpp index 7905077..7fd29b3 100644 --- a/database.hpp +++ b/database.hpp @@ -48,7 +48,7 @@ public: virtual void generateDB() = 0; virtual void addSequence(Sequence) = 0; //add sequence to search engine virtual vector findClosestSequences(Sequence*, int) = 0; // returns indexes of n closest sequences to query - virtual map findClosest(Sequence*, int){ return results; } // returns of n closest sequences to query and their search scores + virtual vector findClosestMegaBlast(Sequence*, int){return results;} virtual float getSearchScore(); virtual int getLongestBase(); virtual void readKmerDB(ifstream&){}; @@ -60,7 +60,7 @@ public: protected: int numSeqs, longest; float searchScore; - map results; + vector results; }; /**************************************************************************************************/ #endif diff --git a/maligner.cpp b/maligner.cpp index de6004d..b393194 100644 --- a/maligner.cpp +++ b/maligner.cpp @@ -32,28 +32,8 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) { refSeqs = getBlastSeqs(query, numWanted); } -//ofstream out; -//string name = toString(numi+1); -//cout << name << endl; -//name = name.substr(name.find_first_of("|")+1); -//cout << name << endl; -//name = name.substr(name.find_first_of("|")+1); -//cout << name << endl; -//name = name.substr(0, name.find_last_of("|")); -//cout << name << endl; -//string filename = name + ".seqsformaligner"; -//openOutputFile(filename, out); -//for (int u = 0; u < refSeqs.size(); u++) { refSeqs[u]->printSequence(out); } -//out.close(); -//filename = name + ".fasta"; -//openOutputFile(filename, out); -//query->printSequence(out); -//out.close(); - -//for (int i = 0; i < refSeqs.size(); i++) { cout << refSeqs[i]->getName() << endl; } -//cout << "before = " << refSeqs.size() << endl; refSeqs = minCoverageFilter(refSeqs); -//cout << "after = " << refSeqs.size() << endl; + if (refSeqs.size() < 2) { for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; } percentIdenticalQueryChimera = 0.0; @@ -61,7 +41,7 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) { } int chimeraPenalty = computeChimeraPenalty(); -//cout << chimeraPenalty << endl; + //fills outputResults chimera = chimeraMaligner(chimeraPenalty, decalc); @@ -500,8 +480,7 @@ float Maligner::computePercentID(string queryAlign, string chimera) { } //*************************************************************************************************************** vector Maligner::getBlastSeqs(Sequence* q, int num) { - try { -cout << q->getName() << endl; + try { //generate blastdb Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty); for (int i = 0; i < db.size(); i++) { database->addSequence(*db[i]); } @@ -509,55 +488,42 @@ cout << q->getName() << endl; database->setNumSeqs(db.size()); //get parts of query - string queryAligned = q->getAligned(); - string leftQuery = queryAligned.substr(0, (queryAligned.length() / 3)); //first 1/3 of the sequence - string rightQuery = queryAligned.substr(((queryAligned.length() / 3)*2)); //last 1/3 of the sequence - + string queryUnAligned = q->getUnaligned(); + string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence + string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence + Sequence* queryLeft = new Sequence(q->getName(), leftQuery); Sequence* queryRight = new Sequence(q->getName(), rightQuery); - map tempIndexesRight = database->findClosest(queryRight, num); - map tempIndexesLeft = database->findClosest(queryLeft, num); - - //merge results - vector mergedResults; + vector tempIndexesRight = database->findClosestMegaBlast(queryRight, num+1); + vector tempIndexesLeft = database->findClosestMegaBlast(queryLeft, num+1); - map::iterator it; - map::iterator it2; + //merge results + map seen; + map::iterator it; - //add in right guys merging common finds - for (it = tempIndexesRight.begin(); it != tempIndexesRight.end(); it++) { - it2 = tempIndexesLeft.find(it->first); - - if (it2 == tempIndexesLeft.end()) { //result only present in right - rank temp(it->first, it->second); - mergedResults.push_back(temp); + vector mergedResults; + for (int i = 0; i < tempIndexesLeft.size(); i++) { + //add left if you havent already + it = seen.find(tempIndexesLeft[i]); + if (it == seen.end()) { + mergedResults.push_back(tempIndexesLeft[i]); + seen[tempIndexesLeft[i]] = tempIndexesLeft[i]; + } - }else { //result present in both save best score - float bestscore; - if (it->second > it2->second) { bestscore = it->second; } - else { bestscore = it2->second; } - - rank temp(it->first, bestscore); - mergedResults.push_back(temp); - - tempIndexesLeft.erase(it2); + //add right if you havent already + it = seen.find(tempIndexesRight[i]); + if (it == seen.end()) { + mergedResults.push_back(tempIndexesRight[i]); + seen[tempIndexesRight[i]] = tempIndexesRight[i]; } } - //add in unique left guys - for (it = tempIndexesLeft.begin(); it != tempIndexesLeft.end(); it++) { - rank temp(it->first, it->second); - mergedResults.push_back(temp); - } - - sort(mergedResults.begin(), mergedResults.end(), compareMembers); vector refResults; for (int i = 0; i < numWanted; i++) { - Sequence* temp = new Sequence(db[mergedResults[i].num]->getName(), db[mergedResults[i].num]->getAligned()); + Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned()); refResults.push_back(temp); -cout << db[mergedResults[i].num]->getName() << endl; } delete queryRight;