X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=mgclustercommand.cpp;h=3ed9bf2131a45ac7a4b0ac44d959b16bfe28cd36;hb=bdb5d82e2a73829b4e1fa42656ad9bcb57e3e948;hp=6d5acb2386ad0ccd836ee41d2d8f5b24c5e4048e;hpb=aa9238c0a9e6e7aa0ed8b8b606b08ad4fd7dcfe3;p=mothur.git diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp index 6d5acb2..3ed9bf2 100644 --- a/mgclustercommand.cpp +++ b/mgclustercommand.cpp @@ -9,6 +9,57 @@ #include "mgclustercommand.h" + +//********************************************************************************************************************** +vector MGClusterCommand::getValidParameters(){ + try { + string Array[] = {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + return myArray; + } + catch(exception& e) { + m->errorOut(e, "MGClusterCommand", "getValidParameters"); + exit(1); + } +} +//********************************************************************************************************************** +MGClusterCommand::MGClusterCommand(){ + try { + abort = true; + //initialize outputTypes + vector tempOutNames; + outputTypes["list"] = tempOutNames; + outputTypes["rabund"] = tempOutNames; + outputTypes["sabund"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "MGClusterCommand", "MGClusterCommand"); + exit(1); + } +} +//********************************************************************************************************************** +vector MGClusterCommand::getRequiredParameters(){ + try { + string Array[] = {"blast"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + return myArray; + } + catch(exception& e) { + m->errorOut(e, "MGClusterCommand", "getRequiredParameters"); + exit(1); + } +} +//********************************************************************************************************************** +vector MGClusterCommand::getRequiredFiles(){ + try { + vector myArray; + return myArray; + } + catch(exception& e) { + m->errorOut(e, "MGClusterCommand", "getRequiredFiles"); + exit(1); + } +} //********************************************************************************************************************** MGClusterCommand::MGClusterCommand(string option) { try { @@ -20,7 +71,7 @@ MGClusterCommand::MGClusterCommand(string option) { else { //valid paramters for this command - string Array[] = {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"}; + string Array[] = {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -34,6 +85,12 @@ MGClusterCommand::MGClusterCommand(string option) { if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } } + //initialize outputTypes + vector tempOutNames; + outputTypes["list"] = tempOutNames; + outputTypes["rabund"] = tempOutNames; + outputTypes["sabund"] = tempOutNames; + //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); if (inputDir == "not found"){ inputDir = ""; } @@ -42,7 +99,7 @@ MGClusterCommand::MGClusterCommand(string option) { it = parameters.find("blast"); //user has given a template file if(it != parameters.end()){ - path = hasPath(it->second); + path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["blast"] = inputDir + it->second; } } @@ -50,7 +107,7 @@ MGClusterCommand::MGClusterCommand(string option) { it = parameters.find("name"); //user has given a template file if(it != parameters.end()){ - path = hasPath(it->second); + path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } @@ -65,7 +122,7 @@ MGClusterCommand::MGClusterCommand(string option) { //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; - outputDir += hasPath(blastfile); //if user entered a file with a path then preserve it + outputDir += m->hasPath(blastfile); //if user entered a file with a path then preserve it } namefile = validParameter.validFile(parameters, "name", true); @@ -97,13 +154,16 @@ MGClusterCommand::MGClusterCommand(string option) { convert(temp, penalty); temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; } - minWanted = isTrue(temp); + minWanted = m->isTrue(temp); temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; } - merge = isTrue(temp); + merge = m->isTrue(temp); temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; } - hclusterWanted = isTrue(temp); + hclusterWanted = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; } + hard = m->isTrue(temp); } } @@ -150,7 +210,7 @@ int MGClusterCommand::execute(){ nameMap->readMap(); }else{ nameMap= new NameAssignment(); } - string fileroot = outputDir + getRootName(getSimpleName(blastfile)); + string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile)); string tag = ""; time_t start; float previousDist = 0.00000; @@ -164,23 +224,26 @@ int MGClusterCommand::execute(){ list = new ListVector(nameMap->getListVector()); RAbundVector* rabund = new RAbundVector(list->getRAbundVector()); - if (m->control_pressed) { delete nameMap; delete read; delete list; delete rabund; return 0; } + if (m->control_pressed) { outputTypes.clear(); delete nameMap; delete read; delete list; delete rabund; return 0; } start = time(NULL); oldList = *list; + map Seq2Bin; + map oldSeq2Bin; if (method == "furthest") { tag = "fn"; } else if (method == "nearest") { tag = "nn"; } else { tag = "an"; } //open output files - openOutputFile(fileroot+ tag + ".list", listFile); - openOutputFile(fileroot+ tag + ".rabund", rabundFile); - openOutputFile(fileroot+ tag + ".sabund", sabundFile); + m->openOutputFile(fileroot+ tag + ".list", listFile); + m->openOutputFile(fileroot+ tag + ".rabund", rabundFile); + m->openOutputFile(fileroot+ tag + ".sabund", sabundFile); if (m->control_pressed) { delete nameMap; delete read; delete list; delete rabund; listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str()); + outputTypes.clear(); return 0; } @@ -195,13 +258,16 @@ int MGClusterCommand::execute(){ else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); } else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); } cluster->setMapWanted(true); + Seq2Bin = cluster->getSeqtoBin(); + oldSeq2Bin = Seq2Bin; if (m->control_pressed) { delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str()); + outputTypes.clear(); return 0; } - + //cluster using cluster classes while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){ @@ -210,11 +276,17 @@ int MGClusterCommand::execute(){ if (m->control_pressed) { delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str()); + outputTypes.clear(); return 0; } float dist = distMatrix->getSmallDist(); - float rndDist = roundDist(dist, precision); + float rndDist; + if (hard) { + rndDist = m->ceilDist(dist, precision); + }else{ + rndDist = m->roundDist(dist, precision); + } if(previousDist <= 0.0000 && dist != previousDist){ oldList.setLabel("unique"); @@ -222,12 +294,12 @@ int MGClusterCommand::execute(){ } else if(rndDist != rndPreviousDist){ if (merge) { - map seq2Bin = cluster->getSeqtoBin(); - ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist); + ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist); if (m->control_pressed) { delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp; listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str()); + outputTypes.clear(); return 0; } @@ -239,10 +311,12 @@ int MGClusterCommand::execute(){ printData(&oldList); } } - + previousDist = dist; rndPreviousDist = rndDist; oldList = *list; + Seq2Bin = cluster->getSeqtoBin(); + oldSeq2Bin = Seq2Bin; } if(previousDist <= 0.0000){ @@ -251,12 +325,12 @@ int MGClusterCommand::execute(){ } else if(rndPreviousDist seq2Bin = cluster->getSeqtoBin(); - ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist); + ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist); if (m->control_pressed) { delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp; listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str()); + outputTypes.clear(); return 0; } @@ -286,20 +360,24 @@ int MGClusterCommand::execute(){ if (m->control_pressed) { delete nameMap; delete list; delete rabund; listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str()); + outputTypes.clear(); return 0; } //create cluster hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff); hcluster->setMapWanted(true); + Seq2Bin = cluster->getSeqtoBin(); + oldSeq2Bin = Seq2Bin; vector seqs; seqs.resize(1); // to start loop //ifstream inHcluster; - //openInputFile(distFile, inHcluster); + //m->openInputFile(distFile, inHcluster); if (m->control_pressed) { delete nameMap; delete list; delete rabund; delete hcluster; listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str()); + outputTypes.clear(); return 0; } @@ -312,6 +390,7 @@ int MGClusterCommand::execute(){ listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str()); remove(distFile.c_str()); remove(overlapFile.c_str()); + outputTypes.clear(); return 0; } @@ -326,10 +405,16 @@ int MGClusterCommand::execute(){ listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str()); remove(distFile.c_str()); remove(overlapFile.c_str()); + outputTypes.clear(); return 0; } - float rndDist = roundDist(seqs[i].dist, precision); + float rndDist; + if (hard) { + rndDist = m->ceilDist(seqs[i].dist, precision); + }else{ + rndDist = m->roundDist(seqs[i].dist, precision); + } if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){ oldList.setLabel("unique"); @@ -337,14 +422,14 @@ int MGClusterCommand::execute(){ } else if((rndDist != rndPreviousDist)){ if (merge) { - map seq2Bin = hcluster->getSeqtoBin(); - ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist); + ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist); if (m->control_pressed) { delete nameMap; delete list; delete rabund; delete hcluster; delete temp; listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str()); remove(distFile.c_str()); remove(overlapFile.c_str()); + outputTypes.clear(); return 0; } @@ -360,6 +445,8 @@ int MGClusterCommand::execute(){ previousDist = seqs[i].dist; rndPreviousDist = rndDist; oldList = *list; + Seq2Bin = cluster->getSeqtoBin(); + oldSeq2Bin = Seq2Bin; } } } @@ -371,14 +458,14 @@ int MGClusterCommand::execute(){ } else if(rndPreviousDist seq2Bin = hcluster->getSeqtoBin(); - ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist); + ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist); if (m->control_pressed) { delete nameMap; delete list; delete rabund; delete hcluster; delete temp; listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str()); remove(distFile.c_str()); remove(overlapFile.c_str()); + outputTypes.clear(); return 0; } @@ -410,14 +497,15 @@ int MGClusterCommand::execute(){ listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str()); globaldata->setListFile(""); globaldata->setFormat(""); + outputTypes.clear(); return 0; } m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine(); - m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine(); - m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine(); + m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".list"); outputTypes["list"].push_back(fileroot+ tag + ".list"); + m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund"); + m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund"); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine(); @@ -452,12 +540,13 @@ ListVector* MGClusterCommand::mergeOPFs(map binInfo, float dist){ try { //create new listvector so you don't overwrite the clustering ListVector* newList = new ListVector(oldList); + bool done = false; ifstream inOverlap; int count = 0; if (hclusterWanted) { - openInputFile(overlapFile, inOverlap); + m->openInputFile(overlapFile, inOverlap); if (inOverlap.eof()) { done = true; } }else { if (overlapMatrix.size() == 0) { done = true; } } @@ -478,15 +567,18 @@ ListVector* MGClusterCommand::mergeOPFs(map binInfo, float dist){ if (!inOverlap.eof()) { string firstName, secondName; float overlapDistance; - inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap); + inOverlap >> firstName >> secondName >> overlapDistance; m->gobble(inOverlap); - map::iterator itA = nameMap->find(firstName); - map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + //commented out because we check this in readblast already + //map::iterator itA = nameMap->find(firstName); + //map::iterator itB = nameMap->find(secondName); + //if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } + //if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } - overlapNode.seq1 = itA->second; - overlapNode.seq2 = itB->second; + //overlapNode.seq1 = itA->second; + //overlapNode.seq2 = itB->second; + overlapNode.seq1 = nameMap->get(firstName); + overlapNode.seq2 = nameMap->get(secondName); overlapNode.dist = overlapDistance; }else { inOverlap.close(); break; } } @@ -495,16 +587,26 @@ ListVector* MGClusterCommand::mergeOPFs(map binInfo, float dist){ //get names of seqs that overlap string name1 = nameMap->get(overlapNode.seq1); string name2 = nameMap->get(overlapNode.seq2); - + //use binInfo to find out if they are already in the same bin + //map::iterator itBin1 = binInfo.find(name1); + //map::iterator itBin2 = binInfo.find(name2); + + //if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); } + //if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); } + + //int binKeep = itBin1->second; + //int binRemove = itBin2->second; + int binKeep = binInfo[name1]; int binRemove = binInfo[name2]; - + //if not merge bins and update binInfo if(binKeep != binRemove) { + //save names in old bin - string names = list->get(binRemove); - + string names = newList->get(binRemove); + //merge bins into name1s bin newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep)); newList->set(binRemove, ""); @@ -538,12 +640,12 @@ ListVector* MGClusterCommand::mergeOPFs(map binInfo, float dist){ void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) { try { //sort distFile - string sortedDistFile = sortFile(unsortedDist); + string sortedDistFile = m->sortFile(unsortedDist, outputDir); remove(unsortedDist.c_str()); //delete unsorted file distFile = sortedDistFile; //sort overlap file - string sortedOverlapFile = sortFile(unsortedOverlap); + string sortedOverlapFile = m->sortFile(unsortedOverlap, outputDir); remove(unsortedOverlap.c_str()); //delete unsorted file overlapFile = sortedOverlapFile; }