X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=splitmatrix.cpp;fp=splitmatrix.cpp;h=9e53c51a8f1b6fbba8e755279e68993fb7f03fa5;hb=a0f87c2ae6414af28d4e70b1e6830401eac21bef;hp=cd05cc46c172122ab9ee23490d38eda5e7308c22;hpb=9946a1b4b50969d08ce059b248bdeecafbf989ac;p=mothur.git diff --git a/splitmatrix.cpp b/splitmatrix.cpp index cd05cc4..9e53c51 100644 --- a/splitmatrix.cpp +++ b/splitmatrix.cpp @@ -9,9 +9,7 @@ #include "splitmatrix.h" #include "phylotree.h" -#include "sequencedb.h" -#include "onegapdist.h" -#include "dist.h" +#include "distancecommand.h" /***********************************************************************/ @@ -26,12 +24,14 @@ SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, stri } /***********************************************************************/ -SplitMatrix::SplitMatrix(string ffile, string tax, float c, string t){ +SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, string t, int p){ m = MothurOut::getInstance(); fastafile = ffile; + namefile = name; taxFile = tax; cutoff = c; method = t; + processors = p; } /***********************************************************************/ @@ -125,7 +125,6 @@ int SplitMatrix::splitClassify(){ createDistanceFilesFromTax(seqGroup, numGroups); } - return 0; } @@ -137,36 +136,114 @@ int SplitMatrix::splitClassify(){ /***********************************************************************/ int SplitMatrix::createDistanceFilesFromTax(map& seqGroup, int numGroups){ try { + map copyGroups = seqGroup; map::iterator it; - map::iterator it2; - map seqIndexInFasta; + set names; + + for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case + remove((fastafile + "." + toString(i) + ".temp").c_str()); + } + + ifstream in; + openInputFile(fastafile, in); + + //parse fastafile + ofstream outFile; + while (!in.eof()) { + Sequence query(in); gobble(in); + if (query.getName() != "") { + + it = seqGroup.find(query.getName()); - //read fastafile - SequenceDB alignDB; - - ifstream filehandle; - openInputFile(fastafile, filehandle); - int numSeqs = 0; - while (!filehandle.eof()) { - //input sequence info into sequencedb - Sequence newSequence(filehandle); + //save names in case no namefile is given + if (namefile == "") { names.insert(query.getName()); } - if (newSequence.getName() != "") { - alignDB.push_back(newSequence); - seqIndexInFasta[newSequence.getName()] = numSeqs; - numSeqs++; + if (it != seqGroup.end()) { //not singleton + openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile); + query.printSequence(outFile); + outFile.close(); + + copyGroups.erase(it); + } } + } + in.close(); + + //warn about sequence in groups that are not in fasta file + for(it = copyGroups.begin(); it != copyGroups.end(); it++) { + m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine(); + exit(1); + } + + copyGroups.clear(); + + //process each distance file + for (int i = 0; i < numGroups; i++) { + + string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors); - //takes care of white space - gobble(filehandle); + Command* command = new DistanceCommand(options); + command->execute(); + delete command; + + remove((fastafile + "." + toString(i) + ".temp").c_str()); + + //remove old names files just in case + remove((namefile + "." + toString(i) + ".temp").c_str()); } - filehandle.close(); - Dist* distCalculator = new oneGapDist(); + singleton = namefile + ".extra.temp"; + ofstream remainingNames; + openOutputFile(singleton, remainingNames); + bool wroteExtra = false; + + ifstream bigNameFile; + openInputFile(namefile, bigNameFile); -//still not done.... + string name, nameList; + while(!bigNameFile.eof()){ + bigNameFile >> name >> nameList; gobble(bigNameFile); + + //did this sequence get assigned a group + it = seqGroup.find(name); + + if (it != seqGroup.end()) { + openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); + outFile << name << '\t' << nameList << endl; + outFile.close(); + }else{ + wroteExtra = true; + remainingNames << name << '\t' << nameList << endl; + } + } + bigNameFile.close(); + remainingNames.close(); + if (!wroteExtra) { + remove(singleton.c_str()); + singleton = "none"; + } + + for(int i=0;i temp; + temp[tempDistFile] = tempNameFile; + dists.push_back(temp); + } + } + fileHandle.close(); + } + + if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); } return 0; } @@ -269,25 +346,37 @@ int SplitMatrix::splitDistanceFileByTax(map& seqGroup, int numGroup } } bigNameFile.close(); - remainingNames.close(); - - if (!wroteExtra) { - remove(singleton.c_str()); - singleton = "none"; - } - + for(int i=0;i temp; temp[tempDistFile] = tempNameFile; dists.push_back(temp); + }else{ + ifstream in; + openInputFile(tempNameFile, in); + + while(!in.eof()) { + in >> name >> nameList; gobble(in); + wroteExtra = true; + remainingNames << name << '\t' << nameList << endl; + } + in.close(); + remove(tempNameFile.c_str()); } } + remainingNames.close(); + + if (!wroteExtra) { + remove(singleton.c_str()); + singleton = "none"; + } + if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str());