From 529ec122f7cac4af987e121d150b878d7c7a0d5d Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Tue, 31 Jul 2012 14:41:10 -0400 Subject: [PATCH] added count file to cluster.classic and cluster.split. modified splitting classes to use the count file. --- clusterclassic.cpp | 199 +++++++++++++++++++++++ clusterclassic.h | 2 + clusterdoturcommand.cpp | 91 +++++++---- clusterdoturcommand.h | 2 +- clustersplitcommand.cpp | 207 +++++++++++++++++------- clustersplitcommand.h | 3 +- splitmatrix.cpp | 345 +++++++++++++++++----------------------- splitmatrix.h | 8 +- 8 files changed, 563 insertions(+), 294 deletions(-) diff --git a/clusterclassic.cpp b/clusterclassic.cpp index 2d1b9a6..ba1f7f6 100644 --- a/clusterclassic.cpp +++ b/clusterclassic.cpp @@ -231,6 +231,205 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) { exit(1); } +} +/***********************************************************************/ +int ClusterClassic::readPhylipFile(string filename, CountTable* countTable) { + try { + double distance; + int square; + string name; + vector matrixNames; + + ifstream fileHandle; + m->openInputFile(filename, fileHandle); + + string numTest; + fileHandle >> numTest >> name; + + if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); } + else { convert(numTest, nseqs); } + + + matrixNames.push_back(name); + + if(countTable == NULL){ + list = new ListVector(nseqs); + list->set(0, name); + } + else{ list = new ListVector(countTable->getListVector()); } + + + //initialize distance matrix to cutoff + dMatrix.resize(nseqs); + //rowSmallDists.resize(nseqs, temp); + for (int i = 1; i < nseqs; i++) { + dMatrix[i].resize(i, aboveCutoff); + } + + + char d; + while((d=fileHandle.get()) != EOF){ + + if(isalnum(d)){ + square = 1; + fileHandle.putback(d); + for(int i=0;i> distance; + } + break; + } + if(d == '\n'){ + square = 0; + break; + } + } + + Progress* reading; + + if(square == 0){ + + reading = new Progress("Reading matrix: ", nseqs * (nseqs - 1) / 2); + + int index = 0; + + for(int i=1;icontrol_pressed) { fileHandle.close(); delete reading; return 0; } + + fileHandle >> name; + matrixNames.push_back(name); + + + //there's A LOT of repeated code throughout this method... + if(countTable == NULL){ + list->set(i, name); + + for(int j=0;jcontrol_pressed) { delete reading; fileHandle.close(); return 0; } + + fileHandle >> distance; + + if (distance == -1) { distance = 1000000; } + else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. + + //if(distance < cutoff){ + dMatrix[i][j] = distance; + if (distance < smallDist) { smallDist = distance; } + //if (rowSmallDists[i].dist > distance) { rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; } + //if (rowSmallDists[j].dist > distance) { rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; } + //} + index++; + reading->update(index); + } + + } + else{ + for(int j=0;j> distance; + + if (m->control_pressed) { delete reading; fileHandle.close(); return 0; } + + if (distance == -1) { distance = 1000000; } + else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. + + if (distance < smallDist) { smallDist = distance; } + + int row = countTable->get(matrixNames[i]); + int col = countTable->get(matrixNames[j]); + + if (row < col) { dMatrix[col][row] = distance; } + else { dMatrix[row][col] = distance; } + + index++; + reading->update(index); + } + } + } + } + else{ + + reading = new Progress("Reading matrix: ", nseqs * nseqs); + + int index = nseqs; + + for(int i=1;i> name; + matrixNames.push_back(name); + + if(countTable == NULL){ + list->set(i, name); + for(int j=0;j> distance; + + if (m->control_pressed) { fileHandle.close(); delete reading; return 0; } + + if (distance == -1) { distance = 1000000; } + else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. + + if(j < i){ + if (distance < smallDist) { smallDist = distance; } + + dMatrix[i][j] = distance; + } + index++; + reading->update(index); + } + + } + else{ + + for(int j=0;j> distance; + + if (m->control_pressed) { fileHandle.close(); delete reading; return 0; } + + if (distance == -1) { distance = 1000000; } + else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. + + if(j < i){ + if (distance < smallDist) { smallDist = distance; } + + int row = countTable->get(matrixNames[i]); + int col = countTable->get(matrixNames[j]); + + if (row < col) { dMatrix[col][row] = distance; } + else { dMatrix[row][col] = distance; } + } + index++; + reading->update(index); + } + } + } + } + + if (m->control_pressed) { fileHandle.close(); delete reading; return 0; } + + reading->finish(); + delete reading; + + list->setLabel("0"); + rabund = new RAbundVector(); + rabund->setLabel(list->getLabel()); + + for(int i = 0; i < list->getNumBins(); i++) { + if (m->control_pressed) { break; } + vector binNames; + string bin = list->get(i); + m->splitAtComma(bin, binNames); + int total = 0; + for (int j = 0; j < binNames.size(); j++) { total += countTable->getNumSeqs(binNames[j]); } + rabund->push_back(total); + } + + fileHandle.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ClusterClassic", "readPhylipFile"); + exit(1); + } + } /***********************************************************************/ //sets smallCol and smallRow, returns distance diff --git a/clusterclassic.h b/clusterclassic.h index a650bbf..eaccb27 100644 --- a/clusterclassic.h +++ b/clusterclassic.h @@ -6,6 +6,7 @@ #include "listvector.hpp" #include "rabundvector.hpp" #include "nameassignment.hpp" +#include "counttable.h" /* * clusterclassic.h @@ -22,6 +23,7 @@ class ClusterClassic { public: ClusterClassic(float, string, bool); int readPhylipFile(string, NameAssignment*); + int readPhylipFile(string, CountTable*); void update(double&); double getSmallDist() { return smallDist; } int getNSeqs() { return nseqs; } diff --git a/clusterdoturcommand.cpp b/clusterdoturcommand.cpp index 9bfb52b..34d2e0e 100644 --- a/clusterdoturcommand.cpp +++ b/clusterdoturcommand.cpp @@ -14,7 +14,8 @@ vector ClusterDoturCommand::setParameters(){ try { CommandParameter pphylip("phylip", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pphylip); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff); CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision); CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod); @@ -37,7 +38,7 @@ string ClusterDoturCommand::getHelpString(){ try { string helpString = ""; helpString += "The cluster.classic command clusters using the algorithm from dotur. \n"; - helpString += "The cluster.classic command parameter options are phylip, name, method, cuttoff, hard, sim, precision. Phylip is required, unless you have a valid current file.\n"; + helpString += "The cluster.classic command parameter options are phylip, name, count, method, cuttoff, hard, sim, precision. Phylip is required, unless you have a valid current file.\n"; helpString += "The cluster.classic command should be in the following format: \n"; helpString += "cluster.classic(phylip=yourDistanceMatrix, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n"; helpString += "The acceptable cluster methods are furthest, nearest, weighted and average. If no method is provided then average is assumed.\n"; @@ -132,7 +133,14 @@ ClusterDoturCommand::ClusterDoturCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } - + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //initialize outputTypes @@ -159,10 +167,17 @@ ClusterDoturCommand::ClusterDoturCommand(string option) { //check for optional parameter and set defaults namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { abort = true; namefile = ""; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.classic command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + string temp; temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; } @@ -204,36 +219,49 @@ int ClusterDoturCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } - if(namefile != ""){ + + ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim); + + NameAssignment* nameMap = NULL; + CountTable* ct = NULL; + if(namefile != "") { nameMap = new NameAssignment(namefile); nameMap->readMap(); - }else{ - nameMap = NULL; - } - - //reads phylip file storing data in 2D vector, also fills list and rabund - ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim); - cluster->readPhylipFile(phylipfile, nameMap); - - if (m->control_pressed) { delete cluster; delete list; delete rabund; return 0; } + cluster->readPhylipFile(phylipfile, nameMap); + delete nameMap; + }else if (countfile != "") { + ct = new CountTable(); + ct->readTable(countfile); + cluster->readPhylipFile(phylipfile, ct); + delete ct; + }else { + cluster->readPhylipFile(phylipfile, nameMap); + } + tag = cluster->getTag(); + + if (m->control_pressed) { delete cluster; return 0; } list = cluster->getListVector(); rabund = cluster->getRAbundVector(); - + if (outputDir == "") { outputDir += m->hasPath(phylipfile); } fileroot = outputDir + m->getRootName(m->getSimpleName(phylipfile)); string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund"); string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund"); - string listFileName = fileroot+ tag + "." + getOutputFileNameTag("list"); + string listFileName = fileroot+ tag + "."; + if (countfile != "") { listFileName += "unique_"; } + listFileName += getOutputFileNameTag("list"); - m->openOutputFile(sabundFileName, sabundFile); - m->openOutputFile(rabundFileName, rabundFile); + if (countfile == "") { + m->openOutputFile(sabundFileName, sabundFile); + m->openOutputFile(rabundFileName, rabundFile); + outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName); + outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName); + + } m->openOutputFile(listFileName, listFile); - - outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName); - outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName); - outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName); + outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName); float previousDist = 0.00000; float rndPreviousDist = 0.00000; @@ -245,7 +273,8 @@ int ClusterDoturCommand::execute(){ int estart = time(NULL); while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){ - if (m->control_pressed) { delete cluster; delete list; delete rabund; sabundFile.close();rabundFile.close();listFile.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0; } + if (m->control_pressed) { delete cluster; delete list; delete rabund; if(countfile == "") {rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } + listFile.close(); m->mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0; } cluster->update(cutoff); @@ -276,18 +305,14 @@ int ClusterDoturCommand::execute(){ else if(rndPreviousDistceilDist(saveCutoff, precision); } - // else { saveCutoff = m->roundDist(saveCutoff, precision); } - // m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); - //} + delete cluster; delete list; delete rabund; //set list file as new current listfile string current = ""; diff --git a/clusterdoturcommand.h b/clusterdoturcommand.h index ec2f083..0f8657c 100644 --- a/clusterdoturcommand.h +++ b/clusterdoturcommand.h @@ -37,7 +37,7 @@ public: private: bool abort, hard, sim; - string method, fileroot, tag, outputDir, phylipfile, namefile; + string method, fileroot, tag, outputDir, phylipfile, namefile, countfile; double cutoff; int precision, length; ofstream sabundFile, rabundFile, listFile; diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index b097f38..09be4aa 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -16,7 +16,8 @@ vector ClusterSplitCommand::setParameters(){ CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy); CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip); CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "",false,false); parameters.push_back(pcount); CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn); CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel); CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod); @@ -45,7 +46,7 @@ vector ClusterSplitCommand::setParameters(){ string ClusterSplitCommand::getHelpString(){ try { string helpString = ""; - helpString += "The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n"; + helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n"; helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n"; helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n"; helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n"; @@ -54,7 +55,8 @@ string ClusterSplitCommand::getHelpString(){ helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n"; helpString += "The phylip and column parameter allow you to enter your distance file. \n"; helpString += "The fasta parameter allows you to enter your aligned fasta file. \n"; - helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n"; + helpString += "The name parameter allows you to enter your name file. \n"; + helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format"; helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n"; helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n"; helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n"; @@ -196,6 +198,14 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["fasta"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -210,9 +220,14 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { else { distfile = columnfile; format = "column"; m->setColumnFile(columnfile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { abort = true; namefile = "";} else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = "";} + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } fastafile = validParameter.validFile(parameters, "fasta", true); if (fastafile == "not open") { abort = true; } @@ -243,14 +258,20 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { } } else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; } - + + if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + if (columnfile != "") { - if (namefile == "") { + if ((namefile == "") && (countfile == "")) { namefile = m->getNameFile(); if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); - abort = true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); + abort = true; + } } } } @@ -265,12 +286,16 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { } } - if (namefile == "") { + if ((namefile == "") && (countfile == "")) { namefile = m->getNameFile(); if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You need to provide a namefile if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); - abort = true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine(); + abort = true; + } } } } @@ -379,7 +404,7 @@ int ClusterSplitCommand::execute(){ //if no names file given with phylip file, create it ListVector* listToMakeNameFile = convert->getListVector(); - if (namefile == "") { //you need to make a namefile for split matrix + if ((namefile == "") && (countfile == "")) { //you need to make a namefile for split matrix ofstream out; namefile = phylipfile + ".names"; m->openOutputFile(namefile, out); @@ -401,9 +426,9 @@ int ClusterSplitCommand::execute(){ //split matrix into non-overlapping groups SplitMatrix* split; - if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); } - else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); } - else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir); } + if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large); } + else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large); } + else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir); } else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; } split->split(); @@ -666,15 +691,21 @@ map ClusterSplitCommand::completeListFile(vector listNames, //read in singletons if (singleton != "none") { - ifstream in; - m->openInputFile(singleton, in); + + ifstream in; + m->openInputFile(singleton, in); string firstCol, secondCol; listSingle = new ListVector(); + + if (countfile != "") { m->getline(in); m->gobble(in); } + while (!in.eof()) { - in >> firstCol >> secondCol; m->gobble(in); - listSingle->push_back(secondCol); + in >> firstCol >> secondCol; m->getline(in); m->gobble(in); + if (countfile == "") { listSingle->push_back(secondCol); } + else { listSingle->push_back(firstCol); } } + in.close(); m->mothurRemove(singleton); @@ -775,15 +806,21 @@ int ClusterSplitCommand::mergeLists(vector listNames, map us string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund"); string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund"); - string listFileName = fileroot+ tag + "." + getOutputFileNameTag("list"); + string listFileName = fileroot+ tag + "."; + if (countfile != "") { listFileName += "unique_"; } + listFileName += getOutputFileNameTag("list"); - m->openOutputFile(sabundFileName, outSabund); - m->openOutputFile(rabundFileName, outRabund); + if (countfile == "") { + m->openOutputFile(sabundFileName, outSabund); + m->openOutputFile(rabundFileName, outRabund); + outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName); + outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName); + + } m->openOutputFile(listFileName, outList); + outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName); + - outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName); - outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName); - outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName); map::iterator itLabel; //for each label needed @@ -794,22 +831,25 @@ int ClusterSplitCommand::mergeLists(vector listNames, map us else { thisLabel = toString(itLabel->first, length-1); } outList << thisLabel << '\t' << itLabel->second << '\t'; - - RAbundVector* rabund = new RAbundVector(); - rabund->setLabel(thisLabel); + + RAbundVector* rabund = NULL; + if (countfile == "") { + rabund = new RAbundVector(); + rabund->setLabel(thisLabel); + } //add in singletons if (listSingle != NULL) { for (int j = 0; j < listSingle->getNumBins(); j++) { outList << listSingle->get(j) << '\t'; - rabund->push_back(m->getNumNames(listSingle->get(j))); + if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); } } } //get the list info from each file for (int k = 0; k < listNames.size(); k++) { - if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } delete rabund; return 0; } + if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } if (rabund != NULL) { delete rabund; } return 0; } InputData* input = new InputData(listNames[k], "list"); ListVector* list = input->getListVector(thisLabel); @@ -819,26 +859,28 @@ int ClusterSplitCommand::mergeLists(vector listNames, map us else { for (int j = 0; j < list->getNumBins(); j++) { outList << list->get(j) << '\t'; - rabund->push_back(m->getNumNames(list->get(j))); + if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); } } delete list; } delete input; } - SAbundVector sabund = rabund->getSAbundVector(); - - sabund.print(outSabund); - rabund->print(outRabund); + if (countfile == "") { + SAbundVector sabund = rabund->getSAbundVector(); + sabund.print(outSabund); + rabund->print(outRabund); + } outList << endl; - delete rabund; + if (rabund != NULL) { delete rabund; } } outList.close(); - outRabund.close(); - outSabund.close(); - + if (countfile == "") { + outRabund.close(); + outSabund.close(); + } if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } @@ -1101,16 +1143,25 @@ string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisN m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine(); - NameAssignment* nameMap = new NameAssignment(thisNamefile); - nameMap->readMap(); - - //reads phylip file storing data in 2D vector, also fills list and rabund + //reads phylip file storing data in 2D vector, also fills list and rabund bool sim = false; ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim); - cluster->readPhylipFile(thisDistFile, nameMap); - tag = cluster->getTag(); - if (m->control_pressed) { delete cluster; return 0; } + NameAssignment* nameMap = NULL; + CountTable* ct = NULL; + if(namefile != ""){ + nameMap = new NameAssignment(thisNamefile); + nameMap->readMap(); + cluster->readPhylipFile(thisDistFile, nameMap); + }else if (countfile != "") { + ct = new CountTable(); + ct->readTable(thisNamefile); + cluster->readPhylipFile(thisDistFile, ct); + } + tag = cluster->getTag(); + + if (m->control_pressed) { if(namefile != ""){ delete nameMap; } + else { delete ct; } delete cluster; return 0; } list = cluster->getListVector(); rabund = cluster->getRAbundVector(); @@ -1136,7 +1187,8 @@ string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisN m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine(); while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){ - if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close(); return listFileName; } + if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close(); if(namefile != ""){ delete nameMap; } + else { delete ct; } return listFileName; } cluster->update(cutoff); @@ -1179,8 +1231,12 @@ string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisN listFile.close(); - delete cluster; delete nameMap; delete list; delete rabund; - + delete cluster; delete list; delete rabund; + if(namefile != ""){ delete nameMap; } + else { delete ct; } + + m->mothurRemove(thisDistFile); + m->mothurRemove(thisNamefile); return listFileName; @@ -1219,18 +1275,30 @@ string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile ReadMatrix* read = new ReadColumnMatrix(thisDistFile); read->setCutoff(cutoff); - NameAssignment* nameMap = new NameAssignment(thisNamefile); - nameMap->readMap(); - read->read(nameMap); - - if (m->control_pressed) { delete read; delete nameMap; return listFileName; } - - list = read->getListVector(); + NameAssignment* nameMap = NULL; + CountTable* ct = NULL; + if(namefile != ""){ + nameMap = new NameAssignment(thisNamefile); + nameMap->readMap(); + read->read(nameMap); + }else if (countfile != "") { + ct = new CountTable(); + ct->readTable(thisNamefile); + read->read(ct); + } + + list = read->getListVector(); oldList = *list; - matrix = read->getDMatrix(); + matrix = read->getDMatrix(); + if(countfile != "") { + rabund = new RAbundVector(); + createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list + delete ct; + }else { rabund = new RAbundVector(list->getRAbundVector()); } + delete read; read = NULL; - delete nameMap; nameMap = NULL; + if (namefile != "") { delete nameMap; nameMap = NULL; } #ifdef USE_MPI @@ -1242,8 +1310,6 @@ string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine(); - rabund = new RAbundVector(list->getRAbundVector()); - //create cluster if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); } else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); } @@ -1385,3 +1451,24 @@ int ClusterSplitCommand::createMergedDistanceFile(vector< map > } } //********************************************************************************************************************** +int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){ + try { + rabund->setLabel(list->getLabel()); + for(int i = 0; i < list->getNumBins(); i++) { + if (m->control_pressed) { break; } + vector binNames; + string bin = list->get(i); + m->splitAtComma(bin, binNames); + int total = 0; + for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]); } + rabund->push_back(total); + } + return 0; + } + catch(exception& e) { + m->errorOut(e, "ClusterCommand", "createRabund"); + exit(1); + } + +} +//********************************************************************************************************************** diff --git a/clustersplitcommand.h b/clustersplitcommand.h index d46bb8e..29dc69a 100644 --- a/clustersplitcommand.h +++ b/clustersplitcommand.h @@ -47,7 +47,7 @@ private: vector processIDS; //processid vector outputNames; - string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, distfile, format, showabund, timing, splitmethod, taxFile, fastafile; + string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, countfile, distfile, format, showabund, timing, splitmethod, taxFile, fastafile; double cutoff, splitcutoff; int precision, length, processors, taxLevelCutoff; bool print_start, abort, hard, large, classic; @@ -62,6 +62,7 @@ private: int mergeLists(vector, map, ListVector*); map completeListFile(vector, string, set&, ListVector*&); int createMergedDistanceFile(vector< map >); + int createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund); }; /////////////////not working for Windows//////////////////////////////////////////////////////////// diff --git a/splitmatrix.cpp b/splitmatrix.cpp index 384b09a..28bc5d4 100644 --- a/splitmatrix.cpp +++ b/splitmatrix.cpp @@ -14,21 +14,23 @@ /***********************************************************************/ -SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){ +SplitMatrix::SplitMatrix(string distfile, string name, string count, string tax, float c, string t, bool l){ m = MothurOut::getInstance(); distFile = distfile; cutoff = c; namefile = name; method = t; taxFile = tax; + countfile = count; large = l; } /***********************************************************************/ -SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, bool cl, string output){ +SplitMatrix::SplitMatrix(string ffile, string name, string count, string tax, float c, float cu, string t, int p, bool cl, string output){ m = MothurOut::getInstance(); fastafile = ffile; namefile = name; + countfile = count; taxFile = tax; cutoff = c; //tax level cutoff distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that @@ -50,7 +52,8 @@ int SplitMatrix::split(){ }else { m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine(); map temp; - temp[distFile] = namefile; + if (namefile != "") { temp[distFile] = namefile; } + else { temp[distFile] = countfile; } dists.push_back(temp); } @@ -159,7 +162,7 @@ int SplitMatrix::createDistanceFilesFromTax(map& seqGroup, int numG it = seqGroup.find(query.getName()); //save names in case no namefile is given - if (namefile == "") { names.insert(query.getName()); } + if ((namefile == "") && (countfile == "")) { names.insert(query.getName()); } if (it != seqGroup.end()) { //not singleton m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile); @@ -196,74 +199,21 @@ int SplitMatrix::createDistanceFilesFromTax(map& seqGroup, int numG m->mothurRemove((fastafile + "." + toString(i) + ".temp")); //remove old names files just in case - m->mothurRemove((namefile + "." + toString(i) + ".temp")); + if (namefile != "") { m->mothurRemove((namefile + "." + toString(i) + ".temp")); } + else { m->mothurRemove((countfile + "." + toString(i) + ".temp")); } } - - singleton = namefile + ".extra.temp"; - ofstream remainingNames; - m->openOutputFile(singleton, remainingNames); - - bool wroteExtra = false; - - ifstream bigNameFile; - m->openInputFile(namefile, bigNameFile); - - string name, nameList; - while(!bigNameFile.eof()){ - bigNameFile >> name >> nameList; m->gobble(bigNameFile); - - //did this sequence get assigned a group - it = seqGroup.find(name); - - if (it != seqGroup.end()) { - m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); - outFile << name << '\t' << nameList << endl; - outFile.close(); - }else{ - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - } - bigNameFile.close(); - - for(int i=0;ihasPath(fastafile); } - string tempDistFile = ""; + + vector tempDistFiles; + for(int i=0;ihasPath(fastafile); } + string tempDistFile = ""; if (classic) { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";} else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; } - - //if there are valid distances - ifstream fileHandle; - fileHandle.open(tempDistFile.c_str()); - if(fileHandle) { - m->gobble(fileHandle); - if (!fileHandle.eof()) { //check for blank file - this could occur if all dists in group are above cutoff - map temp; - temp[tempDistFile] = tempNameFile; - dists.push_back(temp); - }else { - ifstream in; - m->openInputFile(tempNameFile, in); - - while(!in.eof()) { - in >> name >> nameList; m->gobble(in); - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - in.close(); - m->mothurRemove(tempNameFile); - } - } - fileHandle.close(); - } - - remainingNames.close(); - if (!wroteExtra) { - m->mothurRemove(singleton); - singleton = "none"; - } - + tempDistFiles.push_back(tempDistFile); + } + + splitNames(seqGroup, numGroups, tempDistFiles); + if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); } return 0; @@ -279,9 +229,10 @@ int SplitMatrix::splitDistanceFileByTax(map& seqGroup, int numGroup map::iterator it; map::iterator it2; + ofstream outFile; ifstream dFile; m->openInputFile(distFile, dFile); - ofstream outFile; + for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case m->mothurRemove((distFile + "." + toString(i) + ".temp")); @@ -326,9 +277,15 @@ int SplitMatrix::splitDistanceFileByTax(map& seqGroup, int numGroup } } dFile.close(); - + + string inputFile = namefile; + if (countfile != "") { inputFile = countfile; } + + vector tempDistFiles; for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case - m->mothurRemove((namefile + "." + toString(i) + ".temp")); + string tempDistFile = distFile + "." + toString(i) + ".temp"; + tempDistFiles.push_back(tempDistFile); + m->mothurRemove((inputFile + "." + toString(i) + ".temp")); //write out any remaining buffers if (numOutputs[i] > 0) { @@ -341,63 +298,8 @@ int SplitMatrix::splitDistanceFileByTax(map& seqGroup, int numGroup } } - ifstream bigNameFile; - m->openInputFile(namefile, bigNameFile); - - singleton = namefile + ".extra.temp"; - ofstream remainingNames; - m->openOutputFile(singleton, remainingNames); - - bool wroteExtra = false; - - string name, nameList; - while(!bigNameFile.eof()){ - bigNameFile >> name >> nameList; m->gobble(bigNameFile); - - //did this sequence get assigned a group - it = seqGroup.find(name); - - if (it != seqGroup.end()) { - m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); - outFile << name << '\t' << nameList << endl; - outFile.close(); - }else{ - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - } - bigNameFile.close(); - - for(int i=0;i temp; - temp[tempDistFile] = tempNameFile; - dists.push_back(temp); - }else{ - ifstream in; - m->openInputFile(tempNameFile, in); - - while(!in.eof()) { - in >> name >> nameList; m->gobble(in); - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - in.close(); - m->mothurRemove(tempNameFile); - } - } - - remainingNames.close(); - - if (!wroteExtra) { - m->mothurRemove(singleton); - singleton = "none"; - } - + splitNames(seqGroup, numGroups, tempDistFiles); + if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); @@ -645,17 +547,29 @@ int SplitMatrix::splitDistanceLarge(){ m->gobble(dFile); } dFile.close(); - + + vector tempDistFiles; for (int i = 0; i < numGroups; i++) { + string fileName = distFile + "." + toString(i) + ".temp"; + tempDistFiles.push_back(fileName); + //remove old names files just in case + if (numOutputs[i] > 0) { - string fileName = distFile + "." + toString(i) + ".temp"; outFile.open(fileName.c_str(), ios::app); outFile << outputs[i]; outFile.close(); } } - - splitNames(groups); + + map seqGroup; + for (int i = 0; i < groups.size(); i++) { + for (set::iterator itNames = groups[i].begin(); itNames != groups[i].end();) { + seqGroup[*itNames] = i; + groups[i].erase(itNames++); + } + } + + splitNames(seqGroup, numGroups, tempDistFiles); return 0; } @@ -665,73 +579,104 @@ int SplitMatrix::splitDistanceLarge(){ } } //******************************************************************************************************************** -int SplitMatrix::splitNames(vector >& groups){ +int SplitMatrix::splitNames(map& seqGroup, int numGroups, vector& tempDistFiles){ try { - int numGroups = groups.size(); - - ifstream bigNameFile(namefile.c_str()); - if(!bigNameFile){ - cerr << "Error: We can't open the name file\n"; - exit(1); - } - - map nameMap; - string name, nameList; - while(bigNameFile){ - bigNameFile >> name >> nameList; - nameMap[name] = nameList; - m->gobble(bigNameFile); - } - bigNameFile.close(); - - for(int i=0;i 0){ - string fileName = namefile + "." + toString(i) + ".temp"; - ofstream smallNameFile(fileName.c_str(), ios::ate); - - for(set::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){ - map::iterator nIt = nameMap.find(*gIt); - if (nIt != nameMap.end()) { - smallNameFile << nIt->first << '\t' << nIt->second << endl; - nameMap.erase(nIt); - }else{ - m->mothurOut((*gIt) + " is in your distance file and not in your namefile. Please correct."); m->mothurOutEndLine(); exit(1); - } - } - smallNameFile.close(); - } - } - - //names of singletons - if (nameMap.size() != 0) { - singleton = namefile + ".extra.temp"; - ofstream remainingNames(singleton.c_str(), ios::ate); - for(map::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){ - remainingNames << nIt->first << '\t' << nIt->second << endl; - } - remainingNames.close(); - }else { singleton = "none"; } - + ofstream outFile; + map::iterator it; + + string inputFile = namefile; + if (countfile != "") { inputFile = countfile; } + + for(int i=0;imothurRemove((inputFile + "." + toString(i) + ".temp")); } + + singleton = inputFile + ".extra.temp"; + ofstream remainingNames; + m->openOutputFile(singleton, remainingNames); + + bool wroteExtra = false; + + ifstream bigNameFile; + m->openInputFile(inputFile, bigNameFile); + + //grab header line + string headers = ""; + if (countfile != "") { headers = m->getline(bigNameFile); m->gobble(bigNameFile); } + + string name, nameList; + while(!bigNameFile.eof()){ + bigNameFile >> name >> nameList; + m->getline(bigNameFile); m->gobble(bigNameFile); //extra getline is for rest of countfile line if groups are given. + + //did this sequence get assigned a group + it = seqGroup.find(name); + + if (it != seqGroup.end()) { + m->openOutputFileAppend((inputFile + "." + toString(it->second) + ".temp"), outFile); + outFile << name << '\t' << nameList << endl; + outFile.close(); + }else{ + wroteExtra = true; + remainingNames << name << '\t' << nameList << endl; + } + } + bigNameFile.close(); + for(int i=0;i 0){ - string tempNameFile = namefile + "." + toString(i) + ".temp"; - string tempDistFile = distFile + "." + toString(i) + ".temp"; - + string tempNameFile = inputFile + "." + toString(i) + ".temp"; + string tempDistFile = tempDistFiles[i]; + + //if there are valid distances + ifstream fileHandle; + fileHandle.open(tempDistFile.c_str()); + if(fileHandle) { + m->gobble(fileHandle); + if (!fileHandle.eof()) { //check map temp; + if (countfile != "") { + //add header + ofstream out; + string newtempNameFile = tempNameFile + "2"; + m->openOutputFile(newtempNameFile, out); + out << headers << endl; + out.close(); + m->appendFiles(tempNameFile, newtempNameFile); + m->mothurRemove(tempNameFile); + m->renameFile(newtempNameFile, tempNameFile); + } temp[tempDistFile] = tempNameFile; dists.push_back(temp); + }else{ + ifstream in; + m->openInputFile(tempNameFile, in); + + while(!in.eof()) { + in >> name >> nameList; m->gobble(in); + wroteExtra = true; + remainingNames << name << '\t' << nameList << endl; + } + in.close(); + m->mothurRemove(tempNameFile); } + } + fileHandle.close(); } - if (m->control_pressed) { - for (int i = 0; i < dists.size(); i++) { - m->mothurRemove((dists[i].begin()->first)); - m->mothurRemove((dists[i].begin()->second)); - } - dists.clear(); - } + remainingNames.close(); + + if (!wroteExtra) { + m->mothurRemove(singleton); + singleton = "none"; + }else if (countfile != "") { + //add header + ofstream out; + string newtempNameFile = singleton + "2"; + m->openOutputFile(newtempNameFile, out); + out << headers << endl; + out.close(); + m->appendFiles(singleton, newtempNameFile); + m->mothurRemove(singleton); + m->renameFile(newtempNameFile, singleton); + } return 0; } @@ -836,17 +781,27 @@ int SplitMatrix::splitDistanceRAM(){ } dFile.close(); + vector tempDistFiles; for (int i = 0; i < numGroups; i++) { + string fileName = distFile + "." + toString(i) + ".temp"; + tempDistFiles.push_back(fileName); if (outputs[i] != "") { ofstream outFile; - string fileName = distFile + "." + toString(i) + ".temp"; outFile.open(fileName.c_str(), ios::ate); outFile << outputs[i]; outFile.close(); } } - - splitNames(groups); + + map seqGroup; + for (int i = 0; i < groups.size(); i++) { + for (set::iterator itNames = groups[i].begin(); itNames != groups[i].end();) { + seqGroup[*itNames] = i; + groups[i].erase(itNames++); + } + } + + splitNames(seqGroup, numGroups, tempDistFiles); return 0; } diff --git a/splitmatrix.h b/splitmatrix.h index b8aa551..7b468e9 100644 --- a/splitmatrix.h +++ b/splitmatrix.h @@ -19,8 +19,8 @@ class SplitMatrix { public: - SplitMatrix(string, string, string, float, string, bool); //column formatted distance file, namesfile, cutoff, method, large - SplitMatrix(string, string, string, float, float, string, int, bool, string); //fastafile, namefile, taxFile, taxcutoff, cutoff, method, processors, classic, outputDir + SplitMatrix(string, string, string, string, float, string, bool); //column formatted distance file, namesfile, countfile, cutoff, method, large + SplitMatrix(string, string, string, string, float, float, string, int, bool, string); //fastafile, namefile, countfile, taxFile, taxcutoff, cutoff, method, processors, classic, outputDir ~SplitMatrix(); int split(); @@ -30,7 +30,7 @@ class SplitMatrix { private: MothurOut* m; - string distFile, namefile, singleton, method, taxFile, fastafile, outputDir; + string distFile, namefile, singleton, method, taxFile, fastafile, outputDir, countfile; vector< map< string, string> > dists; float cutoff, distCutoff; bool large, classic; @@ -40,7 +40,7 @@ class SplitMatrix { int splitClassify(); int splitDistanceLarge(); int splitDistanceRAM(); - int splitNames(vector >& groups); + int splitNames(map& groups, int, vector&); int splitDistanceFileByTax(map&, int); int createDistanceFilesFromTax(map&, int); }; -- 2.39.2