X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=splitmatrix.cpp;h=f6b5c4d81cf5446adc5e5253808380be7ebbf28d;hp=a4e1f98581b9ce947dd53ad016dc347c71c99ffe;hb=d1c97b8c04bb75faca1e76ffad60b37a4d789d3d;hpb=260ae19c36cb11a53ddc5a75b5e507f8dd8b31d6 diff --git a/splitmatrix.cpp b/splitmatrix.cpp index a4e1f98..f6b5c4d 100644 --- a/splitmatrix.cpp +++ b/splitmatrix.cpp @@ -10,28 +10,33 @@ #include "splitmatrix.h" #include "phylotree.h" #include "distancecommand.h" +#include "seqsummarycommand.h" /***********************************************************************/ -SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){ +SplitMatrix::SplitMatrix(string distfile, string name, string count, string tax, float c, string t, bool l){ m = MothurOut::getInstance(); distFile = distfile; cutoff = c; namefile = name; method = t; taxFile = tax; + countfile = count; large = l; } /***********************************************************************/ -SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, string t, int p, string output){ +SplitMatrix::SplitMatrix(string ffile, string name, string count, string tax, float c, float cu, string t, int p, bool cl, string output){ m = MothurOut::getInstance(); fastafile = ffile; namefile = name; + countfile = count; taxFile = tax; - cutoff = c; + cutoff = c; //tax level cutoff + distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that method = t; processors = p; + classic = cl; outputDir = output; } @@ -47,7 +52,8 @@ int SplitMatrix::split(){ }else { m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine(); map temp; - temp[distFile] = namefile; + if (namefile != "") { temp[distFile] = namefile; } + else { temp[distFile] = countfile; } dists.push_back(temp); } @@ -64,6 +70,8 @@ int SplitMatrix::splitDistance(){ if (large) { splitDistanceLarge(); } else { splitDistanceRAM(); } + + return 0; } catch(exception& e) { @@ -76,7 +84,7 @@ int SplitMatrix::splitDistance(){ int SplitMatrix::splitClassify(){ try { cutoff = int(cutoff); - + map seqGroup; map::iterator it; map::iterator it2; @@ -86,16 +94,13 @@ int SplitMatrix::splitClassify(){ //build tree from users taxonomy file PhyloTree* phylo = new PhyloTree(); - ifstream in; - m->openInputFile(taxFile, in); - - //read in users taxonomy file and add sequences to tree - string seqname, tax; - while(!in.eof()){ - in >> seqname >> tax; m->gobble(in); - phylo->addSeqToTree(seqname, tax); - } - in.close(); + map temp; + m->readTax(taxFile, temp); + + for (map::iterator itTemp = temp.begin(); itTemp != temp.end();) { + phylo->addSeqToTree(itTemp->first, itTemp->second); + temp.erase(itTemp++); + } phylo->assignHeirarchyIDs(0); @@ -142,7 +147,7 @@ int SplitMatrix::createDistanceFilesFromTax(map& seqGroup, int numG set names; for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case - remove((fastafile + "." + toString(i) + ".temp").c_str()); + m->mothurRemove((fastafile + "." + toString(i) + ".temp")); } ifstream in; @@ -157,7 +162,7 @@ int SplitMatrix::createDistanceFilesFromTax(map& seqGroup, int numG it = seqGroup.find(query.getName()); //save names in case no namefile is given - if (namefile == "") { names.insert(query.getName()); } + if ((namefile == "") && (countfile == "")) { names.insert(query.getName()); } if (it != seqGroup.end()) { //not singleton m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile); @@ -177,85 +182,46 @@ int SplitMatrix::createDistanceFilesFromTax(map& seqGroup, int numG } copyGroups.clear(); - + //process each distance file for (int i = 0; i < numGroups; i++) { - string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(cutoff); + string options = ""; + if (classic) { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", output=lt"; } + else { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); } + if (outputDir != "") { options += ", outputdir=" + outputDir; } + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + Command* command = new DistanceCommand(options); + + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + command->execute(); delete command; - remove((fastafile + "." + toString(i) + ".temp").c_str()); + m->mothurRemove((fastafile + "." + toString(i) + ".temp")); //remove old names files just in case - remove((namefile + "." + toString(i) + ".temp").c_str()); - } - - singleton = namefile + ".extra.temp"; - ofstream remainingNames; - m->openOutputFile(singleton, remainingNames); - - bool wroteExtra = false; - - ifstream bigNameFile; - m->openInputFile(namefile, bigNameFile); - - string name, nameList; - while(!bigNameFile.eof()){ - bigNameFile >> name >> nameList; m->gobble(bigNameFile); - - //did this sequence get assigned a group - it = seqGroup.find(name); - - if (it != seqGroup.end()) { - m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); - outFile << name << '\t' << nameList << endl; - outFile.close(); - }else{ - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - } - bigNameFile.close(); - - for(int i=0;igetRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; - - //if there are valid distances - ifstream fileHandle; - fileHandle.open(tempDistFile.c_str()); - if(fileHandle) { - m->gobble(fileHandle); - if (!fileHandle.eof()) { //check for blank file - this could occur if all dists in group are above cutoff - map temp; - temp[tempDistFile] = tempNameFile; - dists.push_back(temp); - }else { - ifstream in; - m->openInputFile(tempNameFile, in); - - while(!in.eof()) { - in >> name >> nameList; m->gobble(in); - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - in.close(); - remove(tempNameFile.c_str()); - } - } - fileHandle.close(); - } - - remainingNames.close(); - if (!wroteExtra) { - remove(singleton.c_str()); - singleton = "none"; + if (namefile != "") { m->mothurRemove((namefile + "." + toString(i) + ".temp")); } + else { m->mothurRemove((countfile + "." + toString(i) + ".temp")); } } - - if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); } + + //restore old fasta file name since dist.seqs overwrites it with the temp files + m->setFastaFile(fastafile); + + vector tempDistFiles; + for(int i=0;ihasPath(fastafile); } + string tempDistFile = ""; + if (classic) { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";} + else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; } + tempDistFiles.push_back(tempDistFile); + } + + splitNames(seqGroup, numGroups, tempDistFiles); + + if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); } return 0; } @@ -270,12 +236,13 @@ int SplitMatrix::splitDistanceFileByTax(map& seqGroup, int numGroup map::iterator it; map::iterator it2; + ofstream outFile; ifstream dFile; m->openInputFile(distFile, dFile); - ofstream outFile; + for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case - remove((distFile + "." + toString(i) + ".temp").c_str()); + m->mothurRemove((distFile + "." + toString(i) + ".temp")); } //for buffering the io to improve speed @@ -292,7 +259,7 @@ int SplitMatrix::splitDistanceFileByTax(map& seqGroup, int numGroup string seqA, seqB; float dist; - if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } } + if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } dFile >> seqA >> seqB >> dist; m->gobble(dFile); @@ -317,9 +284,15 @@ int SplitMatrix::splitDistanceFileByTax(map& seqGroup, int numGroup } } dFile.close(); - + + string inputFile = namefile; + if (countfile != "") { inputFile = countfile; } + + vector tempDistFiles; for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case - remove((namefile + "." + toString(i) + ".temp").c_str()); + string tempDistFile = distFile + "." + toString(i) + ".temp"; + tempDistFiles.push_back(tempDistFile); + m->mothurRemove((inputFile + "." + toString(i) + ".temp")); //write out any remaining buffers if (numOutputs[i] > 0) { @@ -332,67 +305,12 @@ int SplitMatrix::splitDistanceFileByTax(map& seqGroup, int numGroup } } - ifstream bigNameFile; - m->openInputFile(namefile, bigNameFile); - - singleton = namefile + ".extra.temp"; - ofstream remainingNames; - m->openOutputFile(singleton, remainingNames); - - bool wroteExtra = false; - - string name, nameList; - while(!bigNameFile.eof()){ - bigNameFile >> name >> nameList; m->gobble(bigNameFile); - - //did this sequence get assigned a group - it = seqGroup.find(name); - - if (it != seqGroup.end()) { - m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); - outFile << name << '\t' << nameList << endl; - outFile.close(); - }else{ - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - } - bigNameFile.close(); - - for(int i=0;i temp; - temp[tempDistFile] = tempNameFile; - dists.push_back(temp); - }else{ - ifstream in; - m->openInputFile(tempNameFile, in); - - while(!in.eof()) { - in >> name >> nameList; m->gobble(in); - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - in.close(); - remove(tempNameFile.c_str()); - } - } - - remainingNames.close(); - - if (!wroteExtra) { - remove(singleton.c_str()); - singleton = "none"; - } - + splitNames(seqGroup, numGroups, tempDistFiles); + if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { - remove((dists[i].begin()->first).c_str()); - remove((dists[i].begin()->second).c_str()); + m->mothurRemove((dists[i].begin()->first)); + m->mothurRemove((dists[i].begin()->second)); } dists.clear(); } @@ -417,7 +335,7 @@ int SplitMatrix::splitDistanceLarge(){ int numGroups = 0; - ofstream outFile; + //ofstream outFile; ifstream dFile; m->openInputFile(distFile, dFile); @@ -427,7 +345,7 @@ int SplitMatrix::splitDistanceLarge(){ dFile >> seqA >> seqB >> dist; - if (m->control_pressed) { dFile.close(); for(int i=0;i 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; } + if (m->control_pressed) { dFile.close(); for(int i=0;i 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; } if(dist < cutoff){ //cout << "in cutoff: " << dist << endl; @@ -493,6 +411,7 @@ int SplitMatrix::splitDistanceLarge(){ //have we reached the max buffer size if (numOutputs[groupID] > 60) { //write out sequence + ofstream outFile; outFile.open(fileName.c_str(), ios::app); outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl; outFile.close(); @@ -519,7 +438,7 @@ int SplitMatrix::splitDistanceLarge(){ //if groupB is written to file it is above buffer size so read and write to new merged file if (wroteOutPut[groupIDB]) { string fileName2 = distFile + "." + toString(groupIDB) + ".temp"; - ifstream fileB(fileName2.c_str(), ios::ate); + /*ifstream fileB(fileName2.c_str(), ios::ate); outFile.open(fileName.c_str(), ios::app); @@ -554,17 +473,22 @@ int SplitMatrix::splitDistanceLarge(){ outFile << temp.substr(0, lastRead); delete memblock; - fileB.close(); - remove(fileName2.c_str()); + fileB.close();*/ + m->appendFiles(fileName2, fileName); + m->mothurRemove(fileName2); + //write out the merged memory if (numOutputs[groupID] > 60) { - outFile << outputs[groupID]; + ofstream tempOut; + m->openOutputFile(fileName, tempOut); + tempOut << outputs[groupID]; outputs[groupID] = ""; numOutputs[groupID] = 0; + tempOut.close(); } - outFile.close(); + //outFile.close(); wroteOutPut[groupID] = true; wroteOutPut[groupIDB] = false; @@ -579,7 +503,7 @@ int SplitMatrix::splitDistanceLarge(){ if (wroteOutPut[groupIDA]) { string fileName2 = distFile + "." + toString(groupIDA) + ".temp"; - ifstream fileB(fileName2.c_str(), ios::ate); + /*ifstream fileB(fileName2.c_str(), ios::ate); outFile.open(fileName.c_str(), ios::app); @@ -614,17 +538,21 @@ int SplitMatrix::splitDistanceLarge(){ delete memblock; - fileB.close(); - remove(fileName2.c_str()); + fileB.close();*/ + m->appendFiles(fileName2, fileName); + m->mothurRemove(fileName2); //write out the merged memory if (numOutputs[groupID] > 60) { - outFile << outputs[groupID]; + ofstream tempOut; + m->openOutputFile(fileName, tempOut); + tempOut << outputs[groupID]; outputs[groupID] = ""; numOutputs[groupID] = 0; + tempOut.close(); } - outFile.close(); + //outFile.close(); wroteOutPut[groupID] = true; wroteOutPut[groupIDA] = false; @@ -636,17 +564,30 @@ int SplitMatrix::splitDistanceLarge(){ m->gobble(dFile); } dFile.close(); - + + vector tempDistFiles; for (int i = 0; i < numGroups; i++) { + string fileName = distFile + "." + toString(i) + ".temp"; + tempDistFiles.push_back(fileName); + //remove old names files just in case + if (numOutputs[i] > 0) { - string fileName = distFile + "." + toString(i) + ".temp"; + ofstream outFile; outFile.open(fileName.c_str(), ios::app); outFile << outputs[i]; outFile.close(); } } - - splitNames(groups); + + map seqGroup; + for (int i = 0; i < groups.size(); i++) { + for (set::iterator itNames = groups[i].begin(); itNames != groups[i].end();) { + seqGroup[*itNames] = i; + groups[i].erase(itNames++); + } + } + + splitNames(seqGroup, numGroups, tempDistFiles); return 0; } @@ -656,73 +597,104 @@ int SplitMatrix::splitDistanceLarge(){ } } //******************************************************************************************************************** -int SplitMatrix::splitNames(vector >& groups){ +int SplitMatrix::splitNames(map& seqGroup, int numGroups, vector& tempDistFiles){ try { - int numGroups = groups.size(); - - ifstream bigNameFile(namefile.c_str()); - if(!bigNameFile){ - cerr << "Error: We can't open the name file\n"; - exit(1); - } - - map nameMap; - string name, nameList; - while(bigNameFile){ - bigNameFile >> name >> nameList; - nameMap[name] = nameList; - m->gobble(bigNameFile); - } - bigNameFile.close(); - - for(int i=0;i 0){ - string fileName = namefile + "." + toString(i) + ".temp"; - ofstream smallNameFile(fileName.c_str(), ios::ate); - - for(set::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){ - map::iterator nIt = nameMap.find(*gIt); - if (nIt != nameMap.end()) { - smallNameFile << nIt->first << '\t' << nIt->second << endl; - nameMap.erase(nIt); - }else{ - m->mothurOut((*gIt) + " is in your distance file and not in your namefile. Please correct."); m->mothurOutEndLine(); exit(1); - } - } - smallNameFile.close(); - } - } - - //names of singletons - if (nameMap.size() != 0) { - singleton = namefile + ".extra.temp"; - ofstream remainingNames(singleton.c_str(), ios::ate); - for(map::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){ - remainingNames << nIt->first << '\t' << nIt->second << endl; - } - remainingNames.close(); - }else { singleton = "none"; } - + ofstream outFile; + map::iterator it; + + string inputFile = namefile; + if (countfile != "") { inputFile = countfile; } + + for(int i=0;imothurRemove((inputFile + "." + toString(i) + ".temp")); } + + singleton = inputFile + ".extra.temp"; + ofstream remainingNames; + m->openOutputFile(singleton, remainingNames); + + bool wroteExtra = false; + + ifstream bigNameFile; + m->openInputFile(inputFile, bigNameFile); + + //grab header line + string headers = ""; + if (countfile != "") { headers = m->getline(bigNameFile); m->gobble(bigNameFile); } + + string name, nameList; + while(!bigNameFile.eof()){ + bigNameFile >> name >> nameList; + m->getline(bigNameFile); m->gobble(bigNameFile); //extra getline is for rest of countfile line if groups are given. + + //did this sequence get assigned a group + it = seqGroup.find(name); + + if (it != seqGroup.end()) { + m->openOutputFileAppend((inputFile + "." + toString(it->second) + ".temp"), outFile); + outFile << name << '\t' << nameList << endl; + outFile.close(); + }else{ + wroteExtra = true; + remainingNames << name << '\t' << nameList << endl; + } + } + bigNameFile.close(); + for(int i=0;i 0){ - string tempNameFile = namefile + "." + toString(i) + ".temp"; - string tempDistFile = distFile + "." + toString(i) + ".temp"; - + string tempNameFile = inputFile + "." + toString(i) + ".temp"; + string tempDistFile = tempDistFiles[i]; + + //if there are valid distances + ifstream fileHandle; + fileHandle.open(tempDistFile.c_str()); + if(fileHandle) { + m->gobble(fileHandle); + if (!fileHandle.eof()) { //check map temp; + if (countfile != "") { + //add header + ofstream out; + string newtempNameFile = tempNameFile + "2"; + m->openOutputFile(newtempNameFile, out); + out << "Representative_Sequence\ttotal" << endl; + out.close(); + m->appendFiles(tempNameFile, newtempNameFile); + m->mothurRemove(tempNameFile); + m->renameFile(newtempNameFile, tempNameFile); + } temp[tempDistFile] = tempNameFile; dists.push_back(temp); + }else{ + ifstream in; + m->openInputFile(tempNameFile, in); + + while(!in.eof()) { + in >> name >> nameList; m->gobble(in); + wroteExtra = true; + remainingNames << name << '\t' << nameList << endl; + } + in.close(); + m->mothurRemove(tempNameFile); } + } + fileHandle.close(); } - if (m->control_pressed) { - for (int i = 0; i < dists.size(); i++) { - remove((dists[i].begin()->first).c_str()); - remove((dists[i].begin()->second).c_str()); - } - dists.clear(); - } + remainingNames.close(); + + if (!wroteExtra) { + m->mothurRemove(singleton); + singleton = "none"; + }else if (countfile != "") { + //add header + ofstream out; + string newtempNameFile = singleton + "2"; + m->openOutputFile(newtempNameFile, out); + out << "Representative_Sequence\ttotal" << endl; + out.close(); + m->appendFiles(singleton, newtempNameFile); + m->mothurRemove(singleton); + m->renameFile(newtempNameFile, singleton); + } return 0; } @@ -748,7 +720,7 @@ int SplitMatrix::splitDistanceRAM(){ dFile >> seqA >> seqB >> dist; - if (m->control_pressed) { dFile.close(); for(int i=0;i 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; } + if (m->control_pressed) { dFile.close(); for(int i=0;i 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; } if(dist < cutoff){ //cout << "in cutoff: " << dist << endl; @@ -827,17 +799,27 @@ int SplitMatrix::splitDistanceRAM(){ } dFile.close(); + vector tempDistFiles; for (int i = 0; i < numGroups; i++) { + string fileName = distFile + "." + toString(i) + ".temp"; + tempDistFiles.push_back(fileName); if (outputs[i] != "") { ofstream outFile; - string fileName = distFile + "." + toString(i) + ".temp"; outFile.open(fileName.c_str(), ios::ate); outFile << outputs[i]; outFile.close(); } } - - splitNames(groups); + + map seqGroup; + for (int i = 0; i < groups.size(); i++) { + for (set::iterator itNames = groups[i].begin(); itNames != groups[i].end();) { + seqGroup[*itNames] = i; + groups[i].erase(itNames++); + } + } + + splitNames(seqGroup, numGroups, tempDistFiles); return 0; }