X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=chimerauchimecommand.cpp;h=ce7ce456d87c5c8c6e5a137cd661b02710e0977c;hb=3504e4e2feeb05aabb0c79aa42cb696522030924;hp=2b24fd8ed7ccc3fba78fa08d25c6c77a67506229;hpb=4f2c7f477a1ef2d60a1c0c84ab1ba8243af67f87;p=mothur.git diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp index 2b24fd8..ce7ce45 100644 --- a/chimerauchimecommand.cpp +++ b/chimerauchimecommand.cpp @@ -12,36 +12,39 @@ //#include "uc.h" #include "sequence.hpp" #include "referencedb.h" - +#include "systemcommand.h" //********************************************************************************************************************** vector ChimeraUchimeCommand::setParameters(){ try { - CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate); - CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); - CommandParameter pabskew("abskew", "Number", "", "1.9", "", "", "",false,false); parameters.push_back(pabskew); - CommandParameter pchimealns("chimealns", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pchimealns); - CommandParameter pminh("minh", "Number", "", "0.3", "", "", "",false,false); parameters.push_back(pminh); - CommandParameter pmindiv("mindiv", "Number", "", "0.5", "", "", "",false,false); parameters.push_back(pmindiv); - CommandParameter pxn("xn", "Number", "", "8.0", "", "", "",false,false); parameters.push_back(pxn); - CommandParameter pdn("dn", "Number", "", "1.4", "", "", "",false,false); parameters.push_back(pdn); - CommandParameter pxa("xa", "Number", "", "1", "", "", "",false,false); parameters.push_back(pxa); - CommandParameter pchunks("chunks", "Number", "", "4", "", "", "",false,false); parameters.push_back(pchunks); - CommandParameter pminchunk("minchunk", "Number", "", "64", "", "", "",false,false); parameters.push_back(pminchunk); - CommandParameter pidsmoothwindow("idsmoothwindow", "Number", "", "32", "", "", "",false,false); parameters.push_back(pidsmoothwindow); + CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(ptemplate); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","chimera-accnos",false,true,true); parameters.push_back(pfasta); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); + CommandParameter pabskew("abskew", "Number", "", "1.9", "", "", "","",false,false); parameters.push_back(pabskew); + CommandParameter pchimealns("chimealns", "Boolean", "", "F", "", "", "","alns",false,false); parameters.push_back(pchimealns); + CommandParameter pminh("minh", "Number", "", "0.3", "", "", "","",false,false); parameters.push_back(pminh); + CommandParameter pmindiv("mindiv", "Number", "", "0.5", "", "", "","",false,false); parameters.push_back(pmindiv); + CommandParameter pxn("xn", "Number", "", "8.0", "", "", "","",false,false); parameters.push_back(pxn); + CommandParameter pdn("dn", "Number", "", "1.4", "", "", "","",false,false); parameters.push_back(pdn); + CommandParameter pxa("xa", "Number", "", "1", "", "", "","",false,false); parameters.push_back(pxa); + CommandParameter pchunks("chunks", "Number", "", "4", "", "", "","",false,false); parameters.push_back(pchunks); + CommandParameter pminchunk("minchunk", "Number", "", "64", "", "", "","",false,false); parameters.push_back(pminchunk); + CommandParameter pidsmoothwindow("idsmoothwindow", "Number", "", "32", "", "", "","",false,false); parameters.push_back(pidsmoothwindow); + CommandParameter pdups("dereplicate", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pdups); + //CommandParameter pminsmoothid("minsmoothid", "Number", "", "0.95", "", "", "",false,false); parameters.push_back(pminsmoothid); - CommandParameter pmaxp("maxp", "Number", "", "2", "", "", "",false,false); parameters.push_back(pmaxp); - CommandParameter pskipgaps("skipgaps", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pskipgaps); - CommandParameter pskipgaps2("skipgaps2", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pskipgaps2); - CommandParameter pminlen("minlen", "Number", "", "10", "", "", "",false,false); parameters.push_back(pminlen); - CommandParameter pmaxlen("maxlen", "Number", "", "10000", "", "", "",false,false); parameters.push_back(pmaxlen); - CommandParameter pucl("ucl", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pucl); - CommandParameter pqueryfract("queryfract", "Number", "", "0.5", "", "", "",false,false); parameters.push_back(pqueryfract); + CommandParameter pmaxp("maxp", "Number", "", "2", "", "", "","",false,false); parameters.push_back(pmaxp); + CommandParameter pskipgaps("skipgaps", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pskipgaps); + CommandParameter pskipgaps2("skipgaps2", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pskipgaps2); + CommandParameter pminlen("minlen", "Number", "", "10", "", "", "","",false,false); parameters.push_back(pminlen); + CommandParameter pmaxlen("maxlen", "Number", "", "10000", "", "", "","",false,false); parameters.push_back(pmaxlen); + CommandParameter pucl("ucl", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pucl); + CommandParameter pqueryfract("queryfract", "Number", "", "0.5", "", "", "","",false,false); parameters.push_back(pqueryfract); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -58,11 +61,13 @@ string ChimeraUchimeCommand::getHelpString(){ string helpString = ""; helpString += "The chimera.uchime command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n"; helpString += "This command is a wrapper for uchime written by Robert C. Edgar.\n"; - helpString += "The chimera.uchime command parameters are fasta, name, reference, processors, abskew, chimealns, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, skipgaps, skipgaps2, minlen, maxlen, ucl and queryfact.\n"; + helpString += "The chimera.uchime command parameters are fasta, name, count, reference, processors, dereplicate, abskew, chimealns, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, skipgaps, skipgaps2, minlen, maxlen, ucl and queryfact.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n"; helpString += "The name parameter allows you to provide a name file, if you are using template=self. \n"; + helpString += "The count parameter allows you to provide a count file, if you are using template=self. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"; helpString += "The group parameter allows you to provide a group file. The group file can be used with a namesfile and reference=self. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; + helpString += "If the dereplicate parameter is false, then if one group finds the seqeunce to be chimeric, then all groups find it to be chimeric, default=f.\n"; helpString += "The reference parameter allows you to enter a reference file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; helpString += "The abskew parameter can only be used with template=self. Minimum abundance skew. Default 1.9. Abundance skew is: min [ abund(parent1), abund(parent2) ] / abund(query).\n"; @@ -98,6 +103,23 @@ string ChimeraUchimeCommand::getHelpString(){ } } //********************************************************************************************************************** +string ChimeraUchimeCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "chimera") { pattern = "[filename],uchime.chimeras"; } + else if (type == "accnos") { pattern = "[filename],uchime.accnos"; } + else if (type == "alns") { pattern = "[filename],uchime.alns"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "ChimeraUchimeCommand", "getOutputPattern"); + exit(1); + } +} +//********************************************************************************************************************** ChimeraUchimeCommand::ChimeraUchimeCommand(){ try { abort = true; calledHelp = true; @@ -115,7 +137,7 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(){ //*************************************************************************************************************** ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { try { - abort = false; calledHelp = false; + abort = false; calledHelp = false; hasName=false; hasCount=false; ReferenceDB* rdb = ReferenceDB::getInstance(); //allow user to run help @@ -225,9 +247,8 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { //check for required parameters - bool hasName = true; namefile = validParameter.validFile(parameters, "name", false); - if (namefile == "not found") { namefile = ""; hasName = false; } + if (namefile == "not found") { namefile = ""; } else { m->splitAtDash(namefile, nameFileNames); @@ -294,12 +315,91 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { } } } + } + + if (nameFileNames.size() != 0) { hasName = true; } + + //check for required parameters + vector countfileNames; + countfile = validParameter.validFile(parameters, "count", false); + if (countfile == "not found") { + countfile = ""; + }else { + m->splitAtDash(countfile, countfileNames); - //make sure there is at least one valid file left - if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; } + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < countfileNames.size(); i++) { + + bool ignore = false; + if (countfileNames[i] == "current") { + countfileNames[i] = m->getCountTableFile(); + if (nameFileNames[i] != "") { m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(countfileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { countfileNames[i] = inputDir + countfileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = m->openInputFile(countfileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + countfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + }else { + m->setCountTableFile(countfileNames[i]); + } + } + } } - - if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } + + if (countfileNames.size() != 0) { hasCount = true; } + + //make sure there is at least one valid file left + if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + + if (!hasName && hasCount) { nameFileNames = countfileNames; } + + if ((hasCount || hasName) && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of name or count files does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } bool hasGroup = true; groupfile = validParameter.validFile(parameters, "group", false); @@ -377,6 +477,10 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } + if (hasGroup && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; } + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -405,6 +509,7 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { } } }else if (hasName) { templatefile = "self"; } + else if (hasCount) { templatefile = "self"; } else { if (rdb->getSavedReference() != "") { templatefile = rdb->getSavedReference(); @@ -418,7 +523,7 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); - convert(temp, processors); + m->mothurConvert(temp, processors); abskew = validParameter.validFile(parameters, "abskew", false); if (abskew == "not found"){ useAbskew = false; abskew = "1.9"; }else{ useAbskew = true; } if (useAbskew && templatefile != "self") { m->mothurOut("The abskew parameter is only valid with template=self, ignoring."); m->mothurOutEndLine(); useAbskew = false; } @@ -450,6 +555,15 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { temp = validParameter.validFile(parameters, "skipgaps2", false); if (temp == "not found") { temp = "t"; } skipgaps2 = m->isTrue(temp); + + + temp = validParameter.validFile(parameters, "dereplicate", false); + if (temp == "not found") { + if (groupfile != "") { temp = "false"; } + else { temp = "true"; } + } + dups = m->isTrue(temp); + if (hasName && (templatefile != "self")) { m->mothurOut("You have provided a namefile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } if (hasGroup && (templatefile != "self")) { m->mothurOut("You have provided a group file and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } @@ -461,18 +575,46 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { path = path.substr(0, (tempPath.find_last_of('m'))); string uchimeCommand; -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) uchimeCommand = path + "uchime"; // format the database, -o option gives us the ability + if (m->debug) { + m->mothurOut("[DEBUG]: Uchime location using \"which uchime\" = "); + Command* newCommand = new SystemCommand("which uchime"); m->mothurOutEndLine(); + newCommand->execute(); + delete newCommand; + m->mothurOut("[DEBUG]: Mothur's location using \"which mothur\" = "); + newCommand = new SystemCommand("which mothur"); m->mothurOutEndLine(); + newCommand->execute(); + delete newCommand; + } #else uchimeCommand = path + "uchime.exe"; #endif - + //test to make sure uchime exists ifstream in; uchimeCommand = m->getFullPathName(uchimeCommand); int ableToOpen = m->openInputFile(uchimeCommand, in, "no error"); in.close(); - if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + uchimeCommand + " file does not exist. mothur requires the uchime executable."); m->mothurOutEndLine(); abort = true; } - } + if(ableToOpen == 1) { + m->mothurOut(uchimeCommand + " file does not exist. Checking path... \n"); + //check to see if uchime is in the path?? + + string uLocation = m->findProgramPath("uchime"); + + + ifstream in2; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + ableToOpen = m->openInputFile(uLocation, in2, "no error"); in2.close(); +#else + ableToOpen = m->openInputFile((uLocation + ".exe"), in2, "no error"); in2.close(); +#endif + + if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + uLocation + " file does not exist. mothur requires the uchime executable."); m->mothurOutEndLine(); abort = true; } + else { m->mothurOut("Found uchime in your path, using " + uLocation + "\n");uchimeLocation = uLocation; } + }else { uchimeLocation = uchimeCommand; } + + uchimeLocation = m->getFullPathName(uchimeLocation); + } } catch(exception& e) { m->errorOut(e, "ChimeraSlayerCommand", "ChimeraSlayerCommand"); @@ -483,7 +625,8 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { int ChimeraUchimeCommand::execute(){ try{ - if (abort == true) { if (calledHelp) { return 0; } return 2; } + + if (abort == true) { if (calledHelp) { return 0; } return 2; } m->mothurOut("\nuchime by Robert C. Edgar\nhttp://drive5.com/uchime\nThis code is donated to the public domain.\n\n"); @@ -494,28 +637,49 @@ int ChimeraUchimeCommand::execute(){ int start = time(NULL); string nameFile = ""; if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]); }//if user entered a file with a path then preserve it - string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "uchime.chimera"; - string accnosFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "uchime.accnos"; - string alnsFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "uchime.alns"; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])); + string outputFileName = getOutputFileName("chimera", variables); + string accnosFileName = getOutputFileName("accnos", variables); + string alnsFileName = getOutputFileName("alns", variables); string newFasta = m->getRootName(fastaFileNames[s]) + "temp"; //you provided a groupfile string groupFile = ""; - if (groupFileNames.size() != 0) { groupFile = groupFileNames[s]; } + bool hasGroup = false; + if (groupFileNames.size() != 0) { groupFile = groupFileNames[s]; hasGroup = true; } + else if (hasCount) { + CountTable ct; + if (ct.testGroups(nameFileNames[s])) { hasGroup = true; } + } - if ((templatefile == "self") && (groupFile == "")) { //you want to run uchime with a reference template + if ((templatefile == "self") && (!hasGroup)) { //you want to run uchime with a template=self and no groups if (processors != 1) { m->mothurOut("When using template=self, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one nameFile = nameFileNames[s]; }else { nameFile = getNamesFile(fastaFileNames[s]); } - + map seqs; readFasta(fastaFileNames[s], seqs); if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } //read namefile vector nameMapCount; - int error = m->readNames(nameFile, nameMapCount, seqs); if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + int error; + if (hasCount) { + CountTable ct; + ct.readTable(nameFile); + for(map::iterator it = seqs.begin(); it != seqs.end(); it++) { + int num = ct.getNumSeqs(it->first); + if (num == 0) { error = 1; } + else { + seqPriorityNode temp(num, it->second, it->first); + nameMapCount.push_back(temp); + } + } + }else { + error = m->readNames(nameFile, nameMapCount, seqs); if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + } if (error == 1) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } if (seqs.size() != nameMapCount.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your namefile, aborting."); m->mothurOutEndLine(); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } @@ -525,14 +689,23 @@ int ChimeraUchimeCommand::execute(){ if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - if (groupFile != "") { + if (hasGroup) { if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one nameFile = nameFileNames[s]; }else { nameFile = getNamesFile(fastaFileNames[s]); } //Parse sequences by group - SequenceParser parser(groupFile, fastaFileNames[s], nameFile); - vector groups = parser.getNamesOfGroups(); + vector groups; + map uniqueNames; + if (hasCount) { + cparser = new SequenceCountParser(nameFile, fastaFileNames[s]); + groups = cparser->getNamesOfGroups(); + uniqueNames = cparser->getAllSeqsMap(); + }else{ + sparser = new SequenceParser(groupFile, fastaFileNames[s], nameFile); + groups = sparser->getNamesOfGroups(); + uniqueNames = sparser->getAllSeqsMap(); + } if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } @@ -543,19 +716,20 @@ int ChimeraUchimeCommand::execute(){ if (chimealns) { m->openOutputFile(alnsFileName, out2); out2.close(); } int totalSeqs = 0; - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - if(processors == 1) { totalSeqs = driverGroups(parser, outputFileName, newFasta, accnosFileName, alnsFileName, 0, groups.size(), groups); } - else { totalSeqs = createProcessesGroups(parser, outputFileName, newFasta, accnosFileName, alnsFileName, groups); } - #else - totalSeqs = driverGroups(parser, outputFileName, newFasta, accnosFileName, alnsFileName, 0, groups.size(), groups); - #endif - if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + if(processors == 1) { totalSeqs = driverGroups(outputFileName, newFasta, accnosFileName, alnsFileName, 0, groups.size(), groups); } + else { totalSeqs = createProcessesGroups(outputFileName, newFasta, accnosFileName, alnsFileName, groups, nameFile, groupFile, fastaFileNames[s]); } - int totalChimeras = deconvoluteResults(parser, outputFileName, accnosFileName, alnsFileName); - - m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(totalSeqs) + " sequences. " + toString(totalChimeras) + " chimeras were found."); m->mothurOutEndLine(); - m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + if (hasCount) { delete cparser; } + else { delete sparser; } + + if (!dups) { + int totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, alnsFileName); + m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(totalSeqs) + " sequences. " + toString(totalChimeras) + " chimeras were found."); m->mothurOutEndLine(); + m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); + } + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } }else{ @@ -563,12 +737,19 @@ int ChimeraUchimeCommand::execute(){ int numSeqs = 0; int numChimeras = 0; - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ numSeqs = driver(outputFileName, fastaFileNames[s], accnosFileName, alnsFileName, numChimeras); } else{ numSeqs = createProcesses(outputFileName, fastaFileNames[s], accnosFileName, alnsFileName, numChimeras); } - #else - numSeqs = driver(outputFileName, fastaFileNames[s], accnosFileName, alnsFileName, numChimeras); - #endif + + //add headings + ofstream out; + m->openOutputFile(outputFileName+".temp", out); + out << "Score\tQuery\tParentA\tParentB\tIdQM\tIdQA\tIdQB\tIdAB\tIdQT\tLY\tLN\tLA\tRY\tRN\tRA\tDiv\tYN\n"; + out.close(); + + m->appendFiles(outputFileName, outputFileName+".temp"); + m->mothurRemove(outputFileName); rename((outputFileName+".temp").c_str(), outputFileName.c_str()); + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } //remove file made for uchime @@ -603,9 +784,8 @@ int ChimeraUchimeCommand::execute(){ } } //********************************************************************************************************************** -int ChimeraUchimeCommand::deconvoluteResults(SequenceParser& parser, string outputFileName, string accnosFileName, string alnsFileName){ +int ChimeraUchimeCommand::deconvoluteResults(map& uniqueNames, string outputFileName, string accnosFileName, string alnsFileName){ try { - map uniqueNames = parser.getAllSeqsMap(); map::iterator itUnique; int total = 0; @@ -631,7 +811,7 @@ int ChimeraUchimeCommand::deconvoluteResults(SequenceParser& parser, string outp //find unique name itUnique = uniqueNames.find(name); - if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find "+ name + "."); m->mothurOutEndLine(); m->control_pressed = true; } + if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find " + name + "."); m->mothurOutEndLine(); m->control_pressed = true; } else { itChimeras = chimerasInFile.find((itUnique->second)); @@ -656,6 +836,7 @@ int ChimeraUchimeCommand::deconvoluteResults(SequenceParser& parser, string outp ofstream out; m->openOutputFile(outputFileName+".temp", out); out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); + out << "Score\tQuery\tParentA\tParentB\tIdQM\tIdQA\tIdQB\tIdAB\tIdQT\tLY\tLN\tLA\tRY\tRN\tRA\tDiv\tYN\n"; float temp1; string parent1, parent2, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12, temp13, flag; @@ -922,14 +1103,15 @@ string ChimeraUchimeCommand::getNamesFile(string& inputFile){ string inputString = "fasta=" + inputFile; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); - + m->mothurCalling = true; + Command* uniqueCommand = new DeconvoluteCommand(inputString); uniqueCommand->execute(); map > filenames = uniqueCommand->getOutputFiles(); delete uniqueCommand; - + m->mothurCalling = false; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); nameFile = filenames["name"][0]; @@ -943,7 +1125,7 @@ string ChimeraUchimeCommand::getNamesFile(string& inputFile){ } } //********************************************************************************************************************** -int ChimeraUchimeCommand::driverGroups(SequenceParser& parser, string outputFName, string filename, string accnos, string alns, int start, int end, vector groups){ +int ChimeraUchimeCommand::driverGroups(string outputFName, string filename, string accnos, string alns, int start, int end, vector groups){ try { int totalSeqs = 0; @@ -951,8 +1133,10 @@ int ChimeraUchimeCommand::driverGroups(SequenceParser& parser, string outputFNam for (int i = start; i < end; i++) { int start = time(NULL); if (m->control_pressed) { return 0; } - - int error = parser.getSeqs(groups[i], filename, true); if ((error == 1) || m->control_pressed) { return 0; } + + int error; + if (hasCount) { error = cparser->getSeqs(groups[i], filename, true); if ((error == 1) || m->control_pressed) { return 0; } } + else { error = sparser->getSeqs(groups[i], filename, true); if ((error == 1) || m->control_pressed) { return 0; } } int numSeqs = driver((outputFName + groups[i]), filename, (accnos+ groups[i]), (alns+ groups[i]), numChimeras); totalSeqs += numSeqs; @@ -960,7 +1144,8 @@ int ChimeraUchimeCommand::driverGroups(SequenceParser& parser, string outputFNam if (m->control_pressed) { return 0; } //remove file made for uchime - m->mothurRemove(filename); + if (!m->debug) { m->mothurRemove(filename); } + else { m->mothurOut("[DEBUG]: saving file: " + filename + ".\n"); } //append files m->appendFiles((outputFName+groups[i]), outputFName); m->mothurRemove((outputFName+groups[i])); @@ -969,7 +1154,6 @@ int ChimeraUchimeCommand::driverGroups(SequenceParser& parser, string outputFNam m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + groups[i] + "."); m->mothurOutEndLine(); } - return totalSeqs; } @@ -982,36 +1166,33 @@ int ChimeraUchimeCommand::driverGroups(SequenceParser& parser, string outputFNam int ChimeraUchimeCommand::driver(string outputFName, string filename, string accnos, string alns, int& numChimeras){ try { + + outputFName = m->getFullPathName(outputFName); + filename = m->getFullPathName(filename); + alns = m->getFullPathName(alns); + //to allow for spaces in the path outputFName = "\"" + outputFName + "\""; filename = "\"" + filename + "\""; alns = "\"" + alns + "\""; vector cPara; - - char* tempUchime; -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - tempUchime= new char[10]; - *tempUchime = '\0'; - strncat(tempUchime, "./uchime ", 9); -#else - tempUchime= new char[8]; + + string uchimeCommand = uchimeLocation; + uchimeCommand = "\"" + uchimeCommand + "\" "; + + char* tempUchime; + tempUchime= new char[uchimeCommand.length()+1]; *tempUchime = '\0'; - strncat(tempUchime, "uchime ", 7); -#endif + strncat(tempUchime, uchimeCommand.c_str(), uchimeCommand.length()); cPara.push_back(tempUchime); - char* tempIn = new char[8]; - *tempIn = '\0'; strncat(tempIn, "--input", 7); - //strcpy(tempIn, "--input"); - cPara.push_back(tempIn); - char* temp = new char[filename.length()+1]; - *temp = '\0'; strncat(temp, filename.c_str(), filename.length()); - //strcpy(temp, filename.c_str()); - cPara.push_back(temp); - - //are you using a reference file + //are you using a reference file if (templatefile != "self") { + string outputFileName = filename.substr(1, filename.length()-2) + ".uchime_formatted"; + prepFile(filename.substr(1, filename.length()-2), outputFileName); + filename = outputFileName; + filename = "\"" + filename + "\""; //add reference file char* tempRef = new char[5]; //strcpy(tempRef, "--db"); @@ -1023,6 +1204,15 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc cPara.push_back(tempR); } + char* tempIn = new char[8]; + *tempIn = '\0'; strncat(tempIn, "--input", 7); + //strcpy(tempIn, "--input"); + cPara.push_back(tempIn); + char* temp = new char[filename.length()+1]; + *temp = '\0'; strncat(temp, filename.c_str(), filename.length()); + //strcpy(temp, filename.c_str()); + cPara.push_back(temp); + char* tempO = new char[12]; *tempO = '\0'; strncat(tempO, "--uchimeout", 11); //strcpy(tempO, "--uchimeout"); @@ -1226,6 +1416,11 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc //uchime_main(numArgs, uchimeParameters); //cout << "commandString = " << commandString << endl; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#else + commandString = "\"" + commandString + "\""; +#endif + if (m->debug) { m->mothurOut("[DEBUG]: uchime command = " + commandString + ".\n"); } system(commandString.c_str()); //free memory @@ -1254,15 +1449,21 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc string name = ""; string chimeraFlag = ""; - in >> chimeraFlag >> name; + //in >> chimeraFlag >> name; - //fix name if needed - if (templatefile == "self") { - name = name.substr(0, name.length()-1); //rip off last / - name = name.substr(0, name.find_last_of('/')); + string line = m->getline(in); + vector pieces = m->splitWhiteSpace(line); + if (pieces.size() > 2) { + name = pieces[1]; + //fix name if needed + if (templatefile == "self") { + name = name.substr(0, name.length()-1); //rip off last / + name = name.substr(0, name.find_last_of('/')); + } + + chimeraFlag = pieces[pieces.size()-1]; } - - for (int i = 0; i < 15; i++) { in >> chimeraFlag; } + //for (int i = 0; i < 15; i++) { in >> chimeraFlag; } m->gobble(in); if (chimeraFlag == "Y") { out << name << endl; numChimeras++; } @@ -1271,6 +1472,8 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc in.close(); out.close(); + //if (templatefile != "self") { m->mothurRemove(filename); } + return num; } catch(exception& e) { @@ -1279,6 +1482,34 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc } } /**************************************************************************************************/ +//uchime can't handle some of the things allowed in mothurs fasta files. This functions "cleans up" the file. +int ChimeraUchimeCommand::prepFile(string filename, string output) { + try { + + ifstream in; + m->openInputFile(filename, in); + + ofstream out; + m->openOutputFile(output, out); + + while (!in.eof()) { + if (m->control_pressed) { break; } + + Sequence seq(in); m->gobble(in); + + if (seq.getName() != "") { seq.printSequence(out); } + } + in.close(); + out.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ChimeraUchimeCommand", "prepFile"); + exit(1); + } +} +/**************************************************************************************************/ int ChimeraUchimeCommand::createProcesses(string outputFileName, string filename, string accnos, string alns, int& numChimeras) { try { @@ -1286,9 +1517,10 @@ int ChimeraUchimeCommand::createProcesses(string outputFileName, string filename processIDS.clear(); int process = 1; int num = 0; -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - //break up file into multiple files vector files; + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + //break up file into multiple files m->divideFile(filename, processors, files); if (m->control_pressed) { return 0; } @@ -1341,10 +1573,92 @@ int ChimeraUchimeCommand::createProcesses(string outputFileName, string filename } in.close(); m->mothurRemove(tempFile); } +#else + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the preClusterData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + //divide file + int count = 0; + int spot = 0; + map filehandles; + map::iterator it3; + + ofstream* temp; + for (int i = 0; i < processors; i++) { + temp = new ofstream; + filehandles[i] = temp; + m->openOutputFile(filename+toString(i)+".temp", *(temp)); + files.push_back(filename+toString(i)+".temp"); + } + ifstream in; + m->openInputFile(filename, in); + + while(!in.eof()) { + + if (m->control_pressed) { in.close(); for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { (*(it3->second)).close(); delete it3->second; } return 0; } + + Sequence tempSeq(in); m->gobble(in); + + if (tempSeq.getName() != "") { + tempSeq.printSequence(*(filehandles[spot])); + spot++; count++; + if (spot == processors) { spot = 0; } + } + } + in.close(); + + //delete memory + for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { + (*(it3->second)).close(); + delete it3->second; + } + + //sanity check for number of processors + if (count < processors) { processors = count; } + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + vector dummy; //used so that we can use the same struct for MyUchimeSeqsThreadFunction and MyUchimeThreadFunction + + //Create processor worker threads. + for( int i=1; isetBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount); + tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract); + + pDataArray.push_back(tempUchime); + processIDS.push_back(i); + + //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. + //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier + hThreadArray[i-1] = CreateThread(NULL, 0, MyUchimeSeqsThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]); + } + + + //using the main process as a worker saves time and memory + num = driver(outputFileName, files[0], accnos, alns, numChimeras); + + //Wait until all threads have terminated. + WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + num += pDataArray[i]->count; + numChimeras += pDataArray[i]->numChimeras; + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } +#endif //append output files - for(int i=0;iappendFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName); m->mothurRemove((outputFileName + toString(processIDS[i]) + ".temp")); @@ -1359,7 +1673,6 @@ int ChimeraUchimeCommand::createProcesses(string outputFileName, string filename //get rid of the file pieces. for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); } -#endif return num; } catch(exception& e) { @@ -1369,7 +1682,7 @@ int ChimeraUchimeCommand::createProcesses(string outputFileName, string filename } /**************************************************************************************************/ -int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string filename, string accnos, string alns, vector groups) { +int ChimeraUchimeCommand::createProcessesGroups(string outputFName, string filename, string accnos, string alns, vector groups, string nameFile, string groupFile, string fastaFile) { try { processIDS.clear(); @@ -1389,7 +1702,7 @@ int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string o lines.push_back(linePair(startIndex, endIndex)); } -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) //loop through and create all the processes you want while (process != processors) { @@ -1399,7 +1712,7 @@ int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string o processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driverGroups(parser, outputFName + toString(getpid()) + ".temp", filename + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", alns + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups); + num = driverGroups(outputFName + toString(getpid()) + ".temp", filename + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", alns + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups); //pass numSeqs to parent ofstream out; @@ -1417,14 +1730,13 @@ int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string o } //do my part - num = driverGroups(parser, outputFName, filename, accnos, alns, lines[0].start, lines[0].end, groups); + num = driverGroups(outputFName, filename, accnos, alns, lines[0].start, lines[0].end, groups); //force parent to wait until all the processes are done for (int i=0;i> tempNum; num += tempNum; } in.close(); m->mothurRemove(tempFile); } + +#else + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the uchimeData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=1; isetBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount); + tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract); + + pDataArray.push_back(tempUchime); + processIDS.push_back(i); + + //MyUchimeThreadFunction is in header. It must be global or static to work with the threads. + //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier + hThreadArray[i-1] = CreateThread(NULL, 0, MyUchimeThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]); + } + + + //using the main process as a worker saves time and memory + num = driverGroups(outputFName, filename, accnos, alns, lines[0].start, lines[0].end, groups); + //Wait until all threads have terminated. + WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + num += pDataArray[i]->count; + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } +#endif + + //append output files - for(int i=0;iappendFiles((outputFName + toString(processIDS[i]) + ".temp"), outputFName); m->mothurRemove((outputFName + toString(processIDS[i]) + ".temp"));