X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=splitmatrix.cpp;fp=splitmatrix.cpp;h=0000000000000000000000000000000000000000;hb=4a877efa127e56e81a21f53cfdbbfd3bfbe8c4ff;hp=dabcd0469461f25a79ac707cf0e785c19974cc1e;hpb=a6cf29fa4dac0909c7582cb1094151d34093ee76;p=mothur.git diff --git a/splitmatrix.cpp b/splitmatrix.cpp deleted file mode 100644 index dabcd04..0000000 --- a/splitmatrix.cpp +++ /dev/null @@ -1,907 +0,0 @@ -/* - * splitmatrix.cpp - * Mothur - * - * Created by westcott on 5/19/10. - * Copyright 2010 Schloss Lab. All rights reserved. - * - */ - -#include "splitmatrix.h" -#include "phylotree.h" -#include "distancecommand.h" -#include "seqsummarycommand.h" - -/***********************************************************************/ - -SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){ - m = MothurOut::getInstance(); - distFile = distfile; - cutoff = c; - namefile = name; - method = t; - taxFile = tax; - large = l; -} -/***********************************************************************/ - -SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, string output){ - m = MothurOut::getInstance(); - fastafile = ffile; - namefile = name; - taxFile = tax; - cutoff = c; //tax level cutoff - distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that - method = t; - processors = p; - outputDir = output; -} - -/***********************************************************************/ - -int SplitMatrix::split(){ - try { - - if (method == "distance") { - splitDistance(); - }else if ((method == "classify") || (method == "fasta")) { - splitClassify(); - }else { - m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine(); - map temp; - temp[distFile] = namefile; - dists.push_back(temp); - } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SplitMatrix", "split"); - exit(1); - } -} -/***********************************************************************/ -int SplitMatrix::splitDistance(){ - try { - - if (large) { splitDistanceLarge(); } - else { splitDistanceRAM(); } - - return 0; - - } - catch(exception& e) { - m->errorOut(e, "SplitMatrix", "splitDistance"); - exit(1); - } -} - -/***********************************************************************/ -int SplitMatrix::splitClassify(){ - try { - cutoff = int(cutoff); - - map seqGroup; - map::iterator it; - map::iterator it2; - - int numGroups = 0; - - //build tree from users taxonomy file - PhyloTree* phylo = new PhyloTree(); - - ifstream in; - m->openInputFile(taxFile, in); - - //read in users taxonomy file and add sequences to tree - string seqname, tax; - while(!in.eof()){ - in >> seqname >> tax; m->gobble(in); - phylo->addSeqToTree(seqname, tax); - } - in.close(); - - phylo->assignHeirarchyIDs(0); - - //make sure the cutoff is not greater than maxlevel - if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); } - - //for each node in tree - for (int i = 0; i < phylo->getNumNodes(); i++) { - - //is this node within the cutoff - TaxNode taxon = phylo->get(i); - - if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences - if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton - for (int j = 0; j < taxon.accessions.size(); j++) { - seqGroup[taxon.accessions[j]] = numGroups; - } - numGroups++; - } - } - } - - delete phylo; - - if (method == "classify") { - splitDistanceFileByTax(seqGroup, numGroups); - }else { - createDistanceFilesFromTax(seqGroup, numGroups); - } - - return 0; - - } - catch(exception& e) { - m->errorOut(e, "SplitMatrix", "splitClassify"); - exit(1); - } -} -/***********************************************************************/ -int SplitMatrix::createDistanceFilesFromTax(map& seqGroup, int numGroups){ - try { - map copyGroups = seqGroup; - map::iterator it; - set names; - - for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case - m->mothurRemove((fastafile + "." + toString(i) + ".temp")); - } - - ifstream in; - m->openInputFile(fastafile, in); - - //parse fastafile - ofstream outFile; - while (!in.eof()) { - Sequence query(in); m->gobble(in); - if (query.getName() != "") { - - it = seqGroup.find(query.getName()); - - //save names in case no namefile is given - if (namefile == "") { names.insert(query.getName()); } - - if (it != seqGroup.end()) { //not singleton - m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile); - query.printSequence(outFile); - outFile.close(); - - copyGroups.erase(query.getName()); - } - } - } - in.close(); - - //warn about sequence in groups that are not in fasta file - for(it = copyGroups.begin(); it != copyGroups.end(); it++) { - m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine(); - exit(1); - } - - copyGroups.clear(); - - //process each distance file - for (int i = 0; i < numGroups; i++) { - - string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); - if (outputDir != "") { options += ", outputdir=" + outputDir; } - - Command* command = new DistanceCommand(options); - - command->execute(); - delete command; - - m->mothurRemove((fastafile + "." + toString(i) + ".temp")); - - //remove old names files just in case - m->mothurRemove((namefile + "." + toString(i) + ".temp")); - } - - singleton = namefile + ".extra.temp"; - ofstream remainingNames; - m->openOutputFile(singleton, remainingNames); - - bool wroteExtra = false; - - ifstream bigNameFile; - m->openInputFile(namefile, bigNameFile); - - string name, nameList; - while(!bigNameFile.eof()){ - bigNameFile >> name >> nameList; m->gobble(bigNameFile); - - //did this sequence get assigned a group - it = seqGroup.find(name); - - if (it != seqGroup.end()) { - m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); - outFile << name << '\t' << nameList << endl; - outFile.close(); - }else{ - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - } - bigNameFile.close(); - - for(int i=0;ihasPath(fastafile); } - string tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; - - //if there are valid distances - ifstream fileHandle; - fileHandle.open(tempDistFile.c_str()); - if(fileHandle) { - m->gobble(fileHandle); - if (!fileHandle.eof()) { //check for blank file - this could occur if all dists in group are above cutoff - map temp; - temp[tempDistFile] = tempNameFile; - dists.push_back(temp); - }else { - ifstream in; - m->openInputFile(tempNameFile, in); - - while(!in.eof()) { - in >> name >> nameList; m->gobble(in); - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - in.close(); - m->mothurRemove(tempNameFile); - } - } - fileHandle.close(); - } - - remainingNames.close(); - if (!wroteExtra) { - m->mothurRemove(singleton); - singleton = "none"; - } - - if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax"); - exit(1); - } -} -/***********************************************************************/ -int SplitMatrix::splitDistanceFileByTax(map& seqGroup, int numGroups){ - try { - map::iterator it; - map::iterator it2; - - ifstream dFile; - m->openInputFile(distFile, dFile); - ofstream outFile; - - for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case - m->mothurRemove((distFile + "." + toString(i) + ".temp")); - } - - //for buffering the io to improve speed - //allow for 10 dists to be stored, then output. - vector outputs; outputs.resize(numGroups, ""); - vector numOutputs; numOutputs.resize(numGroups, 0); - - //you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match - //this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value - vector validDistances; validDistances.resize(numGroups, false); - - //for each distance - while(dFile){ - string seqA, seqB; - float dist; - - if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } - - dFile >> seqA >> seqB >> dist; m->gobble(dFile); - - //if both sequences are in the same group then they are within the cutoff - it = seqGroup.find(seqA); - it2 = seqGroup.find(seqB); - - if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons - if (it->second == it2->second) { //they are from the same group so add the distance - if (numOutputs[it->second] > 30) { - m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile); - outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl; - outFile.close(); - outputs[it->second] = ""; - numOutputs[it->second] = 0; - validDistances[it->second] = true; - }else{ - outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; - numOutputs[it->second]++; - } - } - } - } - dFile.close(); - - for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case - m->mothurRemove((namefile + "." + toString(i) + ".temp")); - - //write out any remaining buffers - if (numOutputs[i] > 0) { - m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile); - outFile << outputs[i]; - outFile.close(); - outputs[i] = ""; - numOutputs[i] = 0; - validDistances[i] = true; - } - } - - ifstream bigNameFile; - m->openInputFile(namefile, bigNameFile); - - singleton = namefile + ".extra.temp"; - ofstream remainingNames; - m->openOutputFile(singleton, remainingNames); - - bool wroteExtra = false; - - string name, nameList; - while(!bigNameFile.eof()){ - bigNameFile >> name >> nameList; m->gobble(bigNameFile); - - //did this sequence get assigned a group - it = seqGroup.find(name); - - if (it != seqGroup.end()) { - m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); - outFile << name << '\t' << nameList << endl; - outFile.close(); - }else{ - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - } - bigNameFile.close(); - - for(int i=0;i temp; - temp[tempDistFile] = tempNameFile; - dists.push_back(temp); - }else{ - ifstream in; - m->openInputFile(tempNameFile, in); - - while(!in.eof()) { - in >> name >> nameList; m->gobble(in); - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - in.close(); - m->mothurRemove(tempNameFile); - } - } - - remainingNames.close(); - - if (!wroteExtra) { - m->mothurRemove(singleton); - singleton = "none"; - } - - if (m->control_pressed) { - for (int i = 0; i < dists.size(); i++) { - m->mothurRemove((dists[i].begin()->first)); - m->mothurRemove((dists[i].begin()->second)); - } - dists.clear(); - } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax"); - exit(1); - } -} -/***********************************************************************/ -int SplitMatrix::splitDistanceLarge(){ - try { - vector > groups; - - //for buffering the io to improve speed - //allow for 30 dists to be stored, then output. - vector outputs; - vector numOutputs; - vector wroteOutPut; - - int numGroups = 0; - - ofstream outFile; - ifstream dFile; - m->openInputFile(distFile, dFile); - - while(dFile){ - string seqA, seqB; - float dist; - - dFile >> seqA >> seqB >> dist; - - if (m->control_pressed) { dFile.close(); for(int i=0;i 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; } - - if(dist < cutoff){ - //cout << "in cutoff: " << dist << endl; - int groupIDA = -1; - int groupIDB = -1; - int groupID = -1; - - for(int i=0;i::iterator aIt = groups[i].find(seqA); - set::iterator bIt = groups[i].find(seqB); - - if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i] - groups[i].insert(seqB); - groupIDA = i; - groupID = groupIDA; - - //cout << "in aIt: " << groupID << endl; - // break; - } - else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i] - groups[i].insert(seqA); - groupIDB = i; - groupID = groupIDB; - - // cout << "in bIt: " << groupID << endl; - // break; - } - - if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to - if(groupIDA < groupIDB){ - // cout << "A: " << groupIDA << "\t" << groupIDB << endl; - groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA - groups[groupIDB].clear(); - groupID = groupIDA; - } - else{ - // cout << "B: " << groupIDA << "\t" << groupIDB << endl; - groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB - groups[groupIDA].clear(); - groupID = groupIDB; - } - break; - } - } - - //windows is gonna gag on the reuse of outFile, will need to make it local... - - if(groupIDA == -1 && groupIDB == -1){ //we need a new group - set newGroup; - newGroup.insert(seqA); - newGroup.insert(seqB); - groups.push_back(newGroup); - - string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; - outputs.push_back(tempOut); - numOutputs.push_back(1); - wroteOutPut.push_back(false); - - numGroups++; - } - else{ - string fileName = distFile + "." + toString(groupID) + ".temp"; - - //have we reached the max buffer size - if (numOutputs[groupID] > 60) { //write out sequence - outFile.open(fileName.c_str(), ios::app); - outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl; - outFile.close(); - - outputs[groupID] = ""; - numOutputs[groupID] = 0; - wroteOutPut[groupID] = true; - }else { - outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; - numOutputs[groupID]++; - } - - if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above - string row, column, distance; - if(groupIDAmothurRemove(fileName2); - - //write out the merged memory - if (numOutputs[groupID] > 60) { - outFile << outputs[groupID]; - outputs[groupID] = ""; - numOutputs[groupID] = 0; - } - - outFile.close(); - - wroteOutPut[groupID] = true; - wroteOutPut[groupIDB] = false; - }else{ } //just merge b's memory with a's memory - } - else{ - numOutputs[groupID] += numOutputs[groupIDA]; - outputs[groupID] += outputs[groupIDA]; - - outputs[groupIDA] = ""; - numOutputs[groupIDA] = 0; - - if (wroteOutPut[groupIDA]) { - string fileName2 = distFile + "." + toString(groupIDA) + ".temp"; - ifstream fileB(fileName2.c_str(), ios::ate); - - outFile.open(fileName.c_str(), ios::app); - - long size; - char* memblock; - - size = fileB.tellg(); - - fileB.seekg (0, ios::beg); - - int numRead = size / 1024; - int lastRead = size % 1024; - - for (int i = 0; i < numRead; i++) { - - memblock = new char [1024]; - - fileB.read (memblock, 1024); - string temp = memblock; - outFile << temp.substr(0, 1024); - - delete memblock; - } - - memblock = new char [lastRead]; - - fileB.read (memblock, lastRead); - - //not sure why but it will read more than lastRead char...?? - string temp = memblock; - outFile << temp.substr(0, lastRead); - - delete memblock; - - fileB.close(); - m->mothurRemove(fileName2); - - //write out the merged memory - if (numOutputs[groupID] > 60) { - outFile << outputs[groupID]; - outputs[groupID] = ""; - numOutputs[groupID] = 0; - } - - outFile.close(); - - wroteOutPut[groupID] = true; - wroteOutPut[groupIDA] = false; - }else { } //just merge memory - } - } - } - } - m->gobble(dFile); - } - dFile.close(); - - for (int i = 0; i < numGroups; i++) { - if (numOutputs[i] > 0) { - string fileName = distFile + "." + toString(i) + ".temp"; - outFile.open(fileName.c_str(), ios::app); - outFile << outputs[i]; - outFile.close(); - } - } - - splitNames(groups); - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SplitMatrix", "splitDistanceLarge"); - exit(1); - } -} -//******************************************************************************************************************** -int SplitMatrix::splitNames(vector >& groups){ - try { - int numGroups = groups.size(); - - ifstream bigNameFile(namefile.c_str()); - if(!bigNameFile){ - cerr << "Error: We can't open the name file\n"; - exit(1); - } - - map nameMap; - string name, nameList; - while(bigNameFile){ - bigNameFile >> name >> nameList; - nameMap[name] = nameList; - m->gobble(bigNameFile); - } - bigNameFile.close(); - - for(int i=0;i 0){ - string fileName = namefile + "." + toString(i) + ".temp"; - ofstream smallNameFile(fileName.c_str(), ios::ate); - - for(set::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){ - map::iterator nIt = nameMap.find(*gIt); - if (nIt != nameMap.end()) { - smallNameFile << nIt->first << '\t' << nIt->second << endl; - nameMap.erase(nIt); - }else{ - m->mothurOut((*gIt) + " is in your distance file and not in your namefile. Please correct."); m->mothurOutEndLine(); exit(1); - } - } - smallNameFile.close(); - } - } - - //names of singletons - if (nameMap.size() != 0) { - singleton = namefile + ".extra.temp"; - ofstream remainingNames(singleton.c_str(), ios::ate); - for(map::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){ - remainingNames << nIt->first << '\t' << nIt->second << endl; - } - remainingNames.close(); - }else { singleton = "none"; } - - for(int i=0;i 0){ - string tempNameFile = namefile + "." + toString(i) + ".temp"; - string tempDistFile = distFile + "." + toString(i) + ".temp"; - - map temp; - temp[tempDistFile] = tempNameFile; - dists.push_back(temp); - } - } - - if (m->control_pressed) { - for (int i = 0; i < dists.size(); i++) { - m->mothurRemove((dists[i].begin()->first)); - m->mothurRemove((dists[i].begin()->second)); - } - dists.clear(); - } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SplitMatrix", "splitNames"); - exit(1); - } -} -//******************************************************************************************************************** -int SplitMatrix::splitDistanceRAM(){ - try { - vector > groups; - vector outputs; - - int numGroups = 0; - - ifstream dFile; - m->openInputFile(distFile, dFile); - - while(dFile){ - string seqA, seqB; - float dist; - - dFile >> seqA >> seqB >> dist; - - if (m->control_pressed) { dFile.close(); for(int i=0;i 0){ m->mothurRemove((distFile + "." + toString(i) + ".temp")); } } return 0; } - - if(dist < cutoff){ - //cout << "in cutoff: " << dist << endl; - int groupIDA = -1; - int groupIDB = -1; - int groupID = -1; - - for(int i=0;i::iterator aIt = groups[i].find(seqA); - set::iterator bIt = groups[i].find(seqB); - - if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i] - groups[i].insert(seqB); - groupIDA = i; - groupID = groupIDA; - - //cout << "in aIt: " << groupID << endl; - // break; - } - else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i] - groups[i].insert(seqA); - groupIDB = i; - groupID = groupIDB; - - // cout << "in bIt: " << groupID << endl; - // break; - } - - if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to - if(groupIDA < groupIDB){ - // cout << "A: " << groupIDA << "\t" << groupIDB << endl; - groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA - groups[groupIDB].clear(); - groupID = groupIDA; - } - else{ - // cout << "B: " << groupIDA << "\t" << groupIDB << endl; - groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB - groups[groupIDA].clear(); - groupID = groupIDB; - } - break; - } - } - - //windows is gonna gag on the reuse of outFile, will need to make it local... - - if(groupIDA == -1 && groupIDB == -1){ //we need a new group - set newGroup; - newGroup.insert(seqA); - newGroup.insert(seqB); - groups.push_back(newGroup); - - string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; - outputs.push_back(tempOut); - numGroups++; - } - else{ - - outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n'; - - if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above - string row, column, distance; - if(groupIDAgobble(dFile); - } - dFile.close(); - - for (int i = 0; i < numGroups; i++) { - if (outputs[i] != "") { - ofstream outFile; - string fileName = distFile + "." + toString(i) + ".temp"; - outFile.open(fileName.c_str(), ios::ate); - outFile << outputs[i]; - outFile.close(); - } - } - - splitNames(groups); - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SplitMatrix", "splitDistanceRAM"); - exit(1); - } -} -//******************************************************************************************************************** -//sorts biggest to smallest -inline bool compareFileSizes(map left, map right){ - - FILE * pFile; - long leftsize = 0; - - //get num bytes in file - string filename = left.begin()->first; - pFile = fopen (filename.c_str(),"rb"); - string error = "Error opening " + filename; - if (pFile==NULL) perror (error.c_str()); - else{ - fseek (pFile, 0, SEEK_END); - leftsize=ftell (pFile); - fclose (pFile); - } - - FILE * pFile2; - long rightsize = 0; - - //get num bytes in file - filename = right.begin()->first; - pFile2 = fopen (filename.c_str(),"rb"); - error = "Error opening " + filename; - if (pFile2==NULL) perror (error.c_str()); - else{ - fseek (pFile2, 0, SEEK_END); - rightsize=ftell (pFile2); - fclose (pFile2); - } - - return (leftsize > rightsize); -} -/***********************************************************************/ -//returns map of distance files -> namefile sorted by distance file size -vector< map< string, string> > SplitMatrix::getDistanceFiles(){ - try { - - sort(dists.begin(), dists.end(), compareFileSizes); - - return dists; - } - catch(exception& e) { - m->errorOut(e, "SplitMatrix", "getDistanceFiles"); - exit(1); - } -} -/***********************************************************************/ -SplitMatrix::~SplitMatrix(){} -/***********************************************************************/ -