X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=splitmatrix.cpp;h=dabcd0469461f25a79ac707cf0e785c19974cc1e;hb=2bb9267aa4b4ecdf8488b06605cc9f3f36fa4332;hp=0929cb09167aebb2362bc6f29bf01c710d79ec8d;hpb=2a7d1455e8cfe4f67a7173f3a7249762c5436217;p=mothur.git diff --git a/splitmatrix.cpp b/splitmatrix.cpp index 0929cb0..dabcd04 100644 --- a/splitmatrix.cpp +++ b/splitmatrix.cpp @@ -9,6 +9,8 @@ #include "splitmatrix.h" #include "phylotree.h" +#include "distancecommand.h" +#include "seqsummarycommand.h" /***********************************************************************/ @@ -21,6 +23,19 @@ SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, stri taxFile = tax; large = l; } +/***********************************************************************/ + +SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, string output){ + m = MothurOut::getInstance(); + fastafile = ffile; + namefile = name; + taxFile = tax; + cutoff = c; //tax level cutoff + distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that + method = t; + processors = p; + outputDir = output; +} /***********************************************************************/ @@ -29,7 +44,7 @@ int SplitMatrix::split(){ if (method == "distance") { splitDistance(); - }else if (method == "classify") { + }else if ((method == "classify") || (method == "fasta")) { splitClassify(); }else { m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine(); @@ -51,6 +66,8 @@ int SplitMatrix::splitDistance(){ if (large) { splitDistanceLarge(); } else { splitDistanceRAM(); } + + return 0; } catch(exception& e) { @@ -63,7 +80,7 @@ int SplitMatrix::splitDistance(){ int SplitMatrix::splitClassify(){ try { cutoff = int(cutoff); - + map seqGroup; map::iterator it; map::iterator it2; @@ -74,13 +91,12 @@ int SplitMatrix::splitClassify(){ PhyloTree* phylo = new PhyloTree(); ifstream in; - openInputFile(taxFile, in); + m->openInputFile(taxFile, in); //read in users taxonomy file and add sequences to tree string seqname, tax; while(!in.eof()){ - in >> seqname >> tax; gobble(in); - + in >> seqname >> tax; m->gobble(in); phylo->addSeqToTree(seqname, tax); } in.close(); @@ -89,13 +105,13 @@ int SplitMatrix::splitClassify(){ //make sure the cutoff is not greater than maxlevel if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); } - + //for each node in tree for (int i = 0; i < phylo->getNumNodes(); i++) { //is this node within the cutoff TaxNode taxon = phylo->get(i); - + if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton for (int j = 0; j < taxon.accessions.size(); j++) { @@ -105,29 +121,187 @@ int SplitMatrix::splitClassify(){ } } } + + delete phylo; + + if (method == "classify") { + splitDistanceFileByTax(seqGroup, numGroups); + }else { + createDistanceFilesFromTax(seqGroup, numGroups); + } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SplitMatrix", "splitClassify"); + exit(1); + } +} +/***********************************************************************/ +int SplitMatrix::createDistanceFilesFromTax(map& seqGroup, int numGroups){ + try { + map copyGroups = seqGroup; + map::iterator it; + set names; + + for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case + m->mothurRemove((fastafile + "." + toString(i) + ".temp")); + } + + ifstream in; + m->openInputFile(fastafile, in); + + //parse fastafile + ofstream outFile; + while (!in.eof()) { + Sequence query(in); m->gobble(in); + if (query.getName() != "") { + + it = seqGroup.find(query.getName()); + + //save names in case no namefile is given + if (namefile == "") { names.insert(query.getName()); } + + if (it != seqGroup.end()) { //not singleton + m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile); + query.printSequence(outFile); + outFile.close(); + + copyGroups.erase(query.getName()); + } + } + } + in.close(); + + //warn about sequence in groups that are not in fasta file + for(it = copyGroups.begin(); it != copyGroups.end(); it++) { + m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine(); + exit(1); + } + + copyGroups.clear(); + + //process each distance file + for (int i = 0; i < numGroups; i++) { + + string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); + if (outputDir != "") { options += ", outputdir=" + outputDir; } + + Command* command = new DistanceCommand(options); + + command->execute(); + delete command; + + m->mothurRemove((fastafile + "." + toString(i) + ".temp")); + + //remove old names files just in case + m->mothurRemove((namefile + "." + toString(i) + ".temp")); + } + + singleton = namefile + ".extra.temp"; + ofstream remainingNames; + m->openOutputFile(singleton, remainingNames); + + bool wroteExtra = false; + + ifstream bigNameFile; + m->openInputFile(namefile, bigNameFile); + + string name, nameList; + while(!bigNameFile.eof()){ + bigNameFile >> name >> nameList; m->gobble(bigNameFile); + + //did this sequence get assigned a group + it = seqGroup.find(name); + + if (it != seqGroup.end()) { + m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); + outFile << name << '\t' << nameList << endl; + outFile.close(); + }else{ + wroteExtra = true; + remainingNames << name << '\t' << nameList << endl; + } + } + bigNameFile.close(); + + for(int i=0;ihasPath(fastafile); } + string tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; + + //if there are valid distances + ifstream fileHandle; + fileHandle.open(tempDistFile.c_str()); + if(fileHandle) { + m->gobble(fileHandle); + if (!fileHandle.eof()) { //check for blank file - this could occur if all dists in group are above cutoff + map temp; + temp[tempDistFile] = tempNameFile; + dists.push_back(temp); + }else { + ifstream in; + m->openInputFile(tempNameFile, in); + + while(!in.eof()) { + in >> name >> nameList; m->gobble(in); + wroteExtra = true; + remainingNames << name << '\t' << nameList << endl; + } + in.close(); + m->mothurRemove(tempNameFile); + } + } + fileHandle.close(); + } + + remainingNames.close(); + if (!wroteExtra) { + m->mothurRemove(singleton); + singleton = "none"; + } + if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax"); + exit(1); + } +} +/***********************************************************************/ +int SplitMatrix::splitDistanceFileByTax(map& seqGroup, int numGroups){ + try { + map::iterator it; + map::iterator it2; + ifstream dFile; - openInputFile(distFile, dFile); + m->openInputFile(distFile, dFile); ofstream outFile; for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case - remove((distFile + "." + toString(i) + ".temp").c_str()); + m->mothurRemove((distFile + "." + toString(i) + ".temp")); } - //for buffering the io to improve speed //allow for 10 dists to be stored, then output. vector outputs; outputs.resize(numGroups, ""); vector numOutputs; numOutputs.resize(numGroups, 0); + //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match + //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value + vector validDistances; validDistances.resize(numGroups, false); + //for each distance while(dFile){ string seqA, seqB; float dist; - if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } } + if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } - dFile >> seqA >> seqB >> dist; gobble(dFile); + dFile >> seqA >> seqB >> dist; m->gobble(dFile); //if both sequences are in the same group then they are within the cutoff it = seqGroup.find(seqA); @@ -135,12 +309,13 @@ int SplitMatrix::splitClassify(){ if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons if (it->second == it2->second) { //they are from the same group so add the distance - if (numOutputs[it->second] > 10) { - openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile); + if (numOutputs[it->second] > 30) { + m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile); outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl; outFile.close(); outputs[it->second] = ""; numOutputs[it->second] = 0; + validDistances[it->second] = true; }else{ outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; numOutputs[it->second]++; @@ -151,36 +326,37 @@ int SplitMatrix::splitClassify(){ dFile.close(); for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case - remove((namefile + "." + toString(i) + ".temp").c_str()); + m->mothurRemove((namefile + "." + toString(i) + ".temp")); //write out any remaining buffers - if (numOutputs[it->second] > 0) { - openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile); + if (numOutputs[i] > 0) { + m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile); outFile << outputs[i]; outFile.close(); outputs[i] = ""; numOutputs[i] = 0; + validDistances[i] = true; } } ifstream bigNameFile; - openInputFile(namefile, bigNameFile); + m->openInputFile(namefile, bigNameFile); singleton = namefile + ".extra.temp"; ofstream remainingNames; - openOutputFile(singleton, remainingNames); + m->openOutputFile(singleton, remainingNames); bool wroteExtra = false; string name, nameList; while(!bigNameFile.eof()){ - bigNameFile >> name >> nameList; gobble(bigNameFile); + bigNameFile >> name >> nameList; m->gobble(bigNameFile); //did this sequence get assigned a group it = seqGroup.find(name); if (it != seqGroup.end()) { - openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); + m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); outFile << name << '\t' << nameList << endl; outFile.close(); }else{ @@ -189,35 +365,49 @@ int SplitMatrix::splitClassify(){ } } bigNameFile.close(); - remainingNames.close(); - - if (!wroteExtra) { - remove(singleton.c_str()); - singleton = "none"; - } - + for(int i=0;i temp; + temp[tempDistFile] = tempNameFile; + dists.push_back(temp); + }else{ + ifstream in; + m->openInputFile(tempNameFile, in); - map temp; - temp[tempDistFile] = tempNameFile; - dists.push_back(temp); + while(!in.eof()) { + in >> name >> nameList; m->gobble(in); + wroteExtra = true; + remainingNames << name << '\t' << nameList << endl; + } + in.close(); + m->mothurRemove(tempNameFile); + } } + remainingNames.close(); + + if (!wroteExtra) { + m->mothurRemove(singleton); + singleton = "none"; + } + if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { - remove((dists[i].begin()->first).c_str()); - remove((dists[i].begin()->second).c_str()); + m->mothurRemove((dists[i].begin()->first)); + m->mothurRemove((dists[i].begin()->second)); } dists.clear(); } return 0; - } catch(exception& e) { - m->errorOut(e, "SplitMatrix", "splitClassify"); + m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax"); exit(1); } } @@ -236,7 +426,7 @@ int SplitMatrix::splitDistanceLarge(){ ofstream outFile; ifstream dFile; - openInputFile(distFile, dFile); + m->openInputFile(distFile, dFile); while(dFile){ string seqA, seqB; @@ -244,7 +434,7 @@ int SplitMatrix::splitDistanceLarge(){ dFile >> seqA >> seqB >> dist; - if (m->control_pressed) { dFile.close(); for(int i=0;i 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; } + if (m->control_pressed) { dFile.close(); for(int i=0;i 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; } if(dist < cutoff){ //cout << "in cutoff: " << dist << endl; @@ -372,7 +562,7 @@ int SplitMatrix::splitDistanceLarge(){ delete memblock; fileB.close(); - remove(fileName2.c_str()); + m->mothurRemove(fileName2); //write out the merged memory if (numOutputs[groupID] > 60) { @@ -432,7 +622,7 @@ int SplitMatrix::splitDistanceLarge(){ delete memblock; fileB.close(); - remove(fileName2.c_str()); + m->mothurRemove(fileName2); //write out the merged memory if (numOutputs[groupID] > 60) { @@ -450,7 +640,7 @@ int SplitMatrix::splitDistanceLarge(){ } } } - gobble(dFile); + m->gobble(dFile); } dFile.close(); @@ -488,7 +678,7 @@ int SplitMatrix::splitNames(vector >& groups){ while(bigNameFile){ bigNameFile >> name >> nameList; nameMap[name] = nameList; - gobble(bigNameFile); + m->gobble(bigNameFile); } bigNameFile.close(); @@ -535,8 +725,8 @@ int SplitMatrix::splitNames(vector >& groups){ if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { - remove((dists[i].begin()->first).c_str()); - remove((dists[i].begin()->second).c_str()); + m->mothurRemove((dists[i].begin()->first)); + m->mothurRemove((dists[i].begin()->second)); } dists.clear(); } @@ -557,7 +747,7 @@ int SplitMatrix::splitDistanceRAM(){ int numGroups = 0; ifstream dFile; - openInputFile(distFile, dFile); + m->openInputFile(distFile, dFile); while(dFile){ string seqA, seqB; @@ -565,7 +755,7 @@ int SplitMatrix::splitDistanceRAM(){ dFile >> seqA >> seqB >> dist; - if (m->control_pressed) { dFile.close(); for(int i=0;i 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; } + if (m->control_pressed) { dFile.close(); for(int i=0;i 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; } if(dist < cutoff){ //cout << "in cutoff: " << dist << endl; @@ -640,7 +830,7 @@ int SplitMatrix::splitDistanceRAM(){ } } } - gobble(dFile); + m->gobble(dFile); } dFile.close();