From 1e4552ceb0f67f70a147d1cdca7244b62ac115ab Mon Sep 17 00:00:00 2001 From: westcott Date: Tue, 19 Apr 2011 10:08:17 +0000 Subject: [PATCH] finished chimera.slayer template=self change --- chimeraslayer.cpp | 77 ++++++++++++++++++++-------------------- chimeraslayer.h | 9 +++-- chimeraslayercommand.cpp | 60 ++++++++++++++++++------------- chimeraslayercommand.h | 2 +- 4 files changed, 78 insertions(+), 70 deletions(-) diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 8c9417a..c6613bf 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -45,7 +45,7 @@ int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int num } } //*************************************************************************************************************** -ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, string mode, int k, int ms, int mms, int win, float div, +ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, map& prior, string mode, int k, int ms, int mms, int win, float div, int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() { try { fastafile = file; templateSeqs = readSeqs(fastafile); @@ -66,14 +66,16 @@ ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, s numWanted = numw; realign = r; trimChimera = trim; + priority = prior; decalc = new DeCalculator(); createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap //run filter on template - for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } templateSeqs.clear(); - + for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { break; } runFilter(templateSeqs[i]); } + + } catch(exception& e) { m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer"); @@ -217,9 +219,26 @@ int ChimeraSlayer::doPrep() { } } //*************************************************************************************************************** -int ChimeraSlayer::getTemplate(Sequence* q) { +vector ChimeraSlayer::getTemplate(Sequence* q) { try { + //when template=self, the query file is sorted from most abundance to least abundant + //userTemplate grows as the query file is processed by adding sequences that are not chimeric and more abundant + vector userTemplate; + + int myAbund = priority[q->getName()]; + + for (int i = 0; i < templateSeqs.size(); i++) { + + if (m->control_pressed) { return userTemplate; } + + //have I reached a sequence with the same abundance as myself? + if (!(priority[templateSeqs[i]->getName()] > myAbund)) { break; } + + //if its am not chimeric add it + if (chimericSeqs.count(templateSeqs[i]->getName()) == 0) { userTemplate.push_back(templateSeqs[i]); } + } + string kmerDBNameLeft; string kmerDBNameRight; @@ -234,7 +253,7 @@ int ChimeraSlayer::getTemplate(Sequence* q) { #ifdef USE_MPI for (int i = 0; i < userTemplate.size(); i++) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { return userTemplate; } string leftFrag = userTemplate[i]->getUnaligned(); leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); @@ -246,7 +265,7 @@ int ChimeraSlayer::getTemplate(Sequence* q) { databaseLeft->setNumSeqs(userTemplate.size()); for (int i = 0; i < userTemplate.size(); i++) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { return userTemplate; } string rightFrag = userTemplate[i]->getUnaligned(); rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); @@ -262,7 +281,7 @@ int ChimeraSlayer::getTemplate(Sequence* q) { for (int i = 0; i < userTemplate.size(); i++) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { return userTemplate; } string leftFrag = userTemplate[i]->getUnaligned(); leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); @@ -274,7 +293,7 @@ int ChimeraSlayer::getTemplate(Sequence* q) { databaseLeft->setNumSeqs(userTemplate.size()); for (int i = 0; i < userTemplate.size(); i++) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { return userTemplate; } string rightFrag = userTemplate[i]->getUnaligned(); rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); @@ -290,12 +309,12 @@ int ChimeraSlayer::getTemplate(Sequence* q) { //generate blastdb databaseLeft = new BlastDB(-1.0, -1.0, 1, -3); - for (int i = 0; i < userTemplate.size(); i++) { if (m->control_pressed) { return 0; } databaseLeft->addSequence(*userTemplate[i]); } + for (int i = 0; i < userTemplate.size(); i++) { if (m->control_pressed) { return userTemplate; } databaseLeft->addSequence(*userTemplate[i]); } databaseLeft->generateDB(); databaseLeft->setNumSeqs(userTemplate.size()); } - return 0; + return userTemplate; } catch(exception& e) { @@ -310,12 +329,6 @@ ChimeraSlayer::~ChimeraSlayer() { if (templateFileName != "self") { if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; } else if (searchMethod == "blast") { delete databaseLeft; } - }else { - //delete userTemplate - for (int i = 0; i < userTemplate.size(); i++) { - delete userTemplate[i]; - } - userTemplate.clear(); } } //*************************************************************************************************************** @@ -344,6 +357,8 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) { m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine(); outAcc << querySeq->getName() << endl; + if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); } + if (trimChimera) { int lengthLeft = spotMap[chimeraResults[0].winLEnd] - spotMap[chimeraResults[0].winLStart]; int lengthRight = spotMap[chimeraResults[0].winREnd] - spotMap[chimeraResults[0].winRStart]; @@ -364,11 +379,6 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) { out << endl; }else { out << querySeq->getName() << "\tno" << endl; - if (templateFileName == "self") { - Sequence* temp = new Sequence(trimQuery.getName(), trimQuery.getAligned()); - runFilter(temp); - userTemplate.push_back(temp); - } } return trim; @@ -417,6 +427,8 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc, data_results leftP m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine(); outAcc << querySeq->getName() << endl; + if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); } + if (trimChimera) { string newAligned = trim->getAligned(); @@ -470,11 +482,6 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc, data_results leftP out << endl; }else { out << querySeq->getName() << "\tno" << endl; - if (templateFileName == "self") { - Sequence* temp = new Sequence(trimQuery.getName(), trimQuery.getAligned()); - runFilter(temp); - userTemplate.push_back(temp); - } } return trim; @@ -532,6 +539,8 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc, data_results lef outAccString += querySeq->getName() + "\n"; results = true; + if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); } + //write to accnos file int length = outAccString.length(); char* buf2 = new char[length]; @@ -610,12 +619,6 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc, data_results lef MPI_File_write_shared(out, buf, length, MPI_CHAR, &status); delete buf; - - if (template == "self") { - Sequence temp = new Sequence(trimQuery.getName(), trimQuery.getAligned()); - runFilter(temp); - userTemplate.push_back(temp); - } } @@ -650,6 +653,8 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) { outAccString += querySeq->getName() + "\n"; results = true; + if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); } + //write to accnos file int length = outAccString.length(); char* buf2 = new char[length]; @@ -694,12 +699,6 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) { MPI_File_write_shared(out, buf, length, MPI_CHAR, &status); delete buf; - - if (template == "self") { - Sequence temp = new Sequence(trimQuery.getName(), trimQuery.getAligned()); - runFilter(temp); - userTemplate.push_back(temp); - } } return trim; @@ -730,7 +729,7 @@ int ChimeraSlayer::getChimeras(Sequence* query) { //you must create a template vector thisTemplate; if (templateFileName != "self") { thisTemplate = templateSeqs; } - else { getTemplate(query); thisTemplate = userTemplate; } //fills this template and creates the databases + else { thisTemplate = getTemplate(query); } //fills this template and creates the databases if (m->control_pressed) { return 0; } diff --git a/chimeraslayer.h b/chimeraslayer.h index cac96eb..ace98ce 100644 --- a/chimeraslayer.h +++ b/chimeraslayer.h @@ -23,7 +23,7 @@ class ChimeraSlayer : public Chimera { public: ChimeraSlayer(string, string, bool, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool); - ChimeraSlayer(string, string, bool, string, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool); + ChimeraSlayer(string, string, bool, map&, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool); ~ChimeraSlayer(); @@ -46,9 +46,8 @@ class ChimeraSlayer : public Chimera { map spotMap; Database* databaseRight; Database* databaseLeft; - vector userTemplate; //when template=self, the query file is sorted from most abundance to least abundant - //userTemplate grows as the query file is processed by adding sequences that are not chimeric - set namesOfChimericSeqs; //only used when template=self + map priority; //for template=self, seqname, seqAligned, abundance + set chimericSeqs; //for template=self, so we don't add chimeric sequences to the userTemplate set vector chimeraResults; data_results printResults; @@ -62,7 +61,7 @@ class ChimeraSlayer : public Chimera { string getBlock(data_struct, string); string getBlock(data_results, data_results, bool, bool, string); //int readNameFile(string); - int getTemplate(Sequence*); + vector getTemplate(Sequence*); }; diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index 1a9c1fb..f6baa0a 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -381,12 +381,12 @@ int ChimeraSlayerCommand::execute(){ //sort fastafile by abundance, returns new sorted fastafile name m->mothurOut("Sorting fastafile according to abundance..."); cout.flush(); - fastaFileNames[s] = sortFastaFile(fastaFileNames[s], nameFile); + map priority = sortFastaFile(fastaFileNames[s], nameFile); m->mothurOut("Done."); m->mothurOutEndLine(); if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } return 0; } - chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, nameFile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign); + chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, priority, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign); } if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]); }//if user entered a file with a path then preserve it @@ -456,36 +456,45 @@ int ChimeraSlayerCommand::execute(){ MPIPos = m->setFilePosFasta(fastaFileNames[s], numSeqs); //fills MPIPos, returns numSeqs - //send file positions to all processes - for(int i = 1; i < processors; i++) { - MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); - MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + if (templatefile != "self") { //if template=self we can only use 1 processor + //send file positions to all processes + for(int i = 1; i < processors; i++) { + MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); + MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + } } - //figure out how many sequences you have to align numSeqsPerProcessor = numSeqs / processors; int startIndex = pid * numSeqsPerProcessor; if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } - + + if (templatefile == "self") { //if template=self we can only use 1 processor + startIndex = 0; + numSeqsPerProcessor = numSeqs; + } + //do your part driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos); if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } remove(outputFileName.c_str()); remove(accnosFileName.c_str()); delete chimera; return 0; } }else{ //you are a child process - MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); - MPIPos.resize(numSeqs+1); - MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); + if (templatefile != "self") { //if template=self we can only use 1 processor + MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); + MPIPos.resize(numSeqs+1); + MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); - //figure out how many sequences you have to align - numSeqsPerProcessor = numSeqs / processors; - int startIndex = pid * numSeqsPerProcessor; - if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } + //figure out how many sequences you have to align + numSeqsPerProcessor = numSeqs / processors; + int startIndex = pid * numSeqsPerProcessor; + if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } - //do your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos); + //do your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos); - if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; } + + } } //close files @@ -939,9 +948,9 @@ int ChimeraSlayerCommand::divideInHalf(Sequence querySeq, string& leftQuery, str } } /**************************************************************************************************/ - -string ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) { +map ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) { try { + map nameAbund; //read through fastafile and store info map seqs; @@ -950,7 +959,7 @@ string ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) { while (!in.eof()) { - if (m->control_pressed) { in.close(); return ""; } + if (m->control_pressed) { in.close(); return nameAbund; } Sequence seq(in); m->gobble(in); seqs[seq.getName()] = seq.getAligned(); @@ -962,10 +971,10 @@ string ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) { vector nameMapCount; int error = m->readNames(nameFile, nameMapCount, seqs); - if (m->control_pressed) { return ""; } + if (m->control_pressed) { return nameAbund; } - if (error == 1) { m->control_pressed = true; return ""; } - if (seqs.size() != nameMapCount.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your namefile, aborting."); m->mothurOutEndLine(); m->control_pressed = true; return ""; } + if (error == 1) { m->control_pressed = true; return nameAbund; } + if (seqs.size() != nameMapCount.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your namefile, aborting."); m->mothurOutEndLine(); m->control_pressed = true; return nameAbund; } sort(nameMapCount.begin(), nameMapCount.end(), compareSeqPriorityNodes); @@ -976,12 +985,13 @@ string ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) { //print new file in order of for (int i = 0; i < nameMapCount.size(); i++) { out << ">" << nameMapCount[i].name << endl << nameMapCount[i].seq << endl; + nameAbund[nameMapCount[i].name] = nameMapCount[i].numIdentical; } out.close(); rename(newFasta.c_str(), fastaFile.c_str()); - return fastaFile; + return nameAbund; } catch(exception& e) { diff --git a/chimeraslayercommand.h b/chimeraslayercommand.h index 6ca0310..e5ffeb6 100644 --- a/chimeraslayercommand.h +++ b/chimeraslayercommand.h @@ -44,7 +44,7 @@ private: int driver(linePair*, string, string, string, string); int createProcesses(string, string, string, string); int divideInHalf(Sequence, string&, string&); - string sortFastaFile(string, string); + map sortFastaFile(string, string); #ifdef USE_MPI int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&); -- 2.39.2