From a0f87c2ae6414af28d4e70b1e6830401eac21bef Mon Sep 17 00:00:00 2001 From: westcott Date: Mon, 14 Jun 2010 14:01:32 +0000 Subject: [PATCH] fixed bug in cluster.split --- chimeraslayer.cpp | 12 +-- chimeraslayer.h | 4 +- clustersplitcommand.cpp | 13 ++-- makefile | 2 +- splitmatrix.cpp | 159 +++++++++++++++++++++++++++++++--------- splitmatrix.h | 3 +- trimseqscommand.cpp | 2 +- 7 files changed, 143 insertions(+), 52 deletions(-) diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 5295eac..b92e8c8 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -196,7 +196,7 @@ int ChimeraSlayer::print(ostream& out, ostream& outAcc) { } } - printBlock(chimeraResults[0], out); + printBlock(chimeraResults[0], chimeraFlag, out); out << endl; }else { out << querySeq->getName() << "\tno" << endl; } @@ -240,7 +240,7 @@ int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) { } } - outputString = getBlock(chimeraResults[0]); + outputString = getBlock(chimeraResults[0], chimeraFlag); outputString += "\n"; //cout << outputString << endl; //write to output file @@ -387,7 +387,7 @@ int ChimeraSlayer::getChimeras(Sequence* query) { } } //*************************************************************************************************************** -void ChimeraSlayer::printBlock(data_struct data, ostream& out){ +void ChimeraSlayer::printBlock(data_struct data, string flag, ostream& out){ try { //out << ":)\n"; @@ -399,7 +399,7 @@ void ChimeraSlayer::printBlock(data_struct data, ostream& out){ out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t'; out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t'; - out << "yes\t" << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t'; + out << flag << '\t' << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t'; //out << "Similarity of parents: " << data.ab << endl; //out << "Similarity of query to parentA: " << data.qa << endl; @@ -422,7 +422,7 @@ void ChimeraSlayer::printBlock(data_struct data, ostream& out){ } } //*************************************************************************************************************** -string ChimeraSlayer::getBlock(data_struct data){ +string ChimeraSlayer::getBlock(data_struct data, string flag){ try { string outputString = ""; @@ -433,7 +433,7 @@ string ChimeraSlayer::getBlock(data_struct data){ outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t"; outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t"; - outputString += "yes\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t"; + outputString += flag + "\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t"; return outputString; } diff --git a/chimeraslayer.h b/chimeraslayer.h index 3ce4cce..e4615ee 100644 --- a/chimeraslayer.h +++ b/chimeraslayer.h @@ -50,8 +50,8 @@ class ChimeraSlayer : public Chimera { int window, numWanted, kmerSize, match, misMatch, minSim, minCov, minBS, minSNP, parents, iters, increment; float divR; - void printBlock(data_struct, ostream&); - string getBlock(data_struct); + void printBlock(data_struct, string, ostream&); + string getBlock(data_struct, string); }; diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index 363b695..f6bf105 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -128,6 +128,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { if (fastafile != "") { if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; } + if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; } } //check for optional parameter and set defaults @@ -195,7 +196,7 @@ void ClusterSplitCommand::help(){ m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n"); m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n"); m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and split the distance file based on those groups. \n"); - m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file and taxonomy file. \n"); + m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file. \n"); m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n"); m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n"); m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n"); @@ -270,9 +271,9 @@ int ClusterSplitCommand::execute(){ //split matrix into non-overlapping groups SplitMatrix* split; - if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); } - else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); } - else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, taxFile, taxLevelCutoff, splitmethod); } + if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); } + else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); } + else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, splitmethod, processors); } else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; } split->split(); @@ -775,8 +776,8 @@ vector ClusterSplitCommand::cluster(vector< map > distNa listFileNames.clear(); return listFileNames; } - //remove(thisDistFile.c_str()); - //remove(thisNamefile.c_str()); + remove(thisDistFile.c_str()); + remove(thisNamefile.c_str()); } diff --git a/makefile b/makefile index ef5cfe7..3e83112 100644 --- a/makefile +++ b/makefile @@ -463,7 +463,7 @@ mothur : \ ./logsd.o\ ./geom.o\ ./setlogfilecommand.o\ - -o mothur + -o ../Release/mothur clean : rm \ diff --git a/splitmatrix.cpp b/splitmatrix.cpp index cd05cc4..9e53c51 100644 --- a/splitmatrix.cpp +++ b/splitmatrix.cpp @@ -9,9 +9,7 @@ #include "splitmatrix.h" #include "phylotree.h" -#include "sequencedb.h" -#include "onegapdist.h" -#include "dist.h" +#include "distancecommand.h" /***********************************************************************/ @@ -26,12 +24,14 @@ SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, stri } /***********************************************************************/ -SplitMatrix::SplitMatrix(string ffile, string tax, float c, string t){ +SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, string t, int p){ m = MothurOut::getInstance(); fastafile = ffile; + namefile = name; taxFile = tax; cutoff = c; method = t; + processors = p; } /***********************************************************************/ @@ -125,7 +125,6 @@ int SplitMatrix::splitClassify(){ createDistanceFilesFromTax(seqGroup, numGroups); } - return 0; } @@ -137,36 +136,114 @@ int SplitMatrix::splitClassify(){ /***********************************************************************/ int SplitMatrix::createDistanceFilesFromTax(map& seqGroup, int numGroups){ try { + map copyGroups = seqGroup; map::iterator it; - map::iterator it2; - map seqIndexInFasta; + set names; + + for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case + remove((fastafile + "." + toString(i) + ".temp").c_str()); + } + + ifstream in; + openInputFile(fastafile, in); + + //parse fastafile + ofstream outFile; + while (!in.eof()) { + Sequence query(in); gobble(in); + if (query.getName() != "") { + + it = seqGroup.find(query.getName()); - //read fastafile - SequenceDB alignDB; - - ifstream filehandle; - openInputFile(fastafile, filehandle); - int numSeqs = 0; - while (!filehandle.eof()) { - //input sequence info into sequencedb - Sequence newSequence(filehandle); + //save names in case no namefile is given + if (namefile == "") { names.insert(query.getName()); } - if (newSequence.getName() != "") { - alignDB.push_back(newSequence); - seqIndexInFasta[newSequence.getName()] = numSeqs; - numSeqs++; + if (it != seqGroup.end()) { //not singleton + openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile); + query.printSequence(outFile); + outFile.close(); + + copyGroups.erase(it); + } } + } + in.close(); + + //warn about sequence in groups that are not in fasta file + for(it = copyGroups.begin(); it != copyGroups.end(); it++) { + m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine(); + exit(1); + } + + copyGroups.clear(); + + //process each distance file + for (int i = 0; i < numGroups; i++) { + + string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors); - //takes care of white space - gobble(filehandle); + Command* command = new DistanceCommand(options); + command->execute(); + delete command; + + remove((fastafile + "." + toString(i) + ".temp").c_str()); + + //remove old names files just in case + remove((namefile + "." + toString(i) + ".temp").c_str()); } - filehandle.close(); - Dist* distCalculator = new oneGapDist(); + singleton = namefile + ".extra.temp"; + ofstream remainingNames; + openOutputFile(singleton, remainingNames); + bool wroteExtra = false; + + ifstream bigNameFile; + openInputFile(namefile, bigNameFile); -//still not done.... + string name, nameList; + while(!bigNameFile.eof()){ + bigNameFile >> name >> nameList; gobble(bigNameFile); + + //did this sequence get assigned a group + it = seqGroup.find(name); + + if (it != seqGroup.end()) { + openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); + outFile << name << '\t' << nameList << endl; + outFile.close(); + }else{ + wroteExtra = true; + remainingNames << name << '\t' << nameList << endl; + } + } + bigNameFile.close(); + remainingNames.close(); + if (!wroteExtra) { + remove(singleton.c_str()); + singleton = "none"; + } + + for(int i=0;i temp; + temp[tempDistFile] = tempNameFile; + dists.push_back(temp); + } + } + fileHandle.close(); + } + + if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); } return 0; } @@ -269,25 +346,37 @@ int SplitMatrix::splitDistanceFileByTax(map& seqGroup, int numGroup } } bigNameFile.close(); - remainingNames.close(); - - if (!wroteExtra) { - remove(singleton.c_str()); - singleton = "none"; - } - + for(int i=0;i temp; temp[tempDistFile] = tempNameFile; dists.push_back(temp); + }else{ + ifstream in; + openInputFile(tempNameFile, in); + + while(!in.eof()) { + in >> name >> nameList; gobble(in); + wroteExtra = true; + remainingNames << name << '\t' << nameList << endl; + } + in.close(); + remove(tempNameFile.c_str()); } } + remainingNames.close(); + + if (!wroteExtra) { + remove(singleton.c_str()); + singleton = "none"; + } + if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); diff --git a/splitmatrix.h b/splitmatrix.h index 9467dc1..b98bdfb 100644 --- a/splitmatrix.h +++ b/splitmatrix.h @@ -20,7 +20,7 @@ class SplitMatrix { public: SplitMatrix(string, string, string, float, string, bool); //column formatted distance file, namesfile, cutoff, method, large - SplitMatrix(string, string, float, string); //fastafile, taxFile, cutoff, method + SplitMatrix(string, string, string, float, string, int); //fastafile, namefile, taxFile, cutoff, method, processors ~SplitMatrix(); int split(); @@ -34,6 +34,7 @@ class SplitMatrix { vector< map< string, string> > dists; float cutoff; bool large; + int processors; int splitDistance(); int splitClassify(); diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 705ac25..71e5c92 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -163,7 +163,7 @@ void TrimSeqsCommand::help(){ m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n"); m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n"); m->mothurOut("The fasta parameter is required.\n"); - m->mothurOut("The flip parameter .... The default is 0.\n"); + m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n"); m->mothurOut("The oligos parameter .... The default is "".\n"); m->mothurOut("The maxambig parameter .... The default is -1.\n"); m->mothurOut("The maxhomop parameter .... The default is 0.\n"); -- 2.39.2