X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=chimeraslayercommand.cpp;h=f01bd3ee9d9bb3611b52b38819f98300da9cff39;hp=629eca965b3ca1c084c3d837b6a2870ea8fa43ec;hb=d1c97b8c04bb75faca1e76ffad60b37a4d789d3d;hpb=8c8acb6218f58f662466e4111ab8aa4da0caf93c diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index 629eca9..f01bd3e 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -11,36 +11,40 @@ #include "deconvolutecommand.h" #include "referencedb.h" #include "sequenceparser.h" +#include "counttable.h" //********************************************************************************************************************** vector ChimeraSlayerCommand::setParameters(){ try { - CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate); - CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter pwindow("window", "Number", "", "50", "", "", "",false,false); parameters.push_back(pwindow); - CommandParameter pksize("ksize", "Number", "", "7", "", "", "",false,false); parameters.push_back(pksize); - CommandParameter pmatch("match", "Number", "", "5.0", "", "", "",false,false); parameters.push_back(pmatch); - CommandParameter pmismatch("mismatch", "Number", "", "-4.0", "", "", "",false,false); parameters.push_back(pmismatch); - CommandParameter pminsim("minsim", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminsim); - CommandParameter pmincov("mincov", "Number", "", "70", "", "", "",false,false); parameters.push_back(pmincov); - CommandParameter pminsnp("minsnp", "Number", "", "10", "", "", "",false,false); parameters.push_back(pminsnp); - CommandParameter pminbs("minbs", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminbs); - CommandParameter psearch("search", "Multiple", "kmer-blast", "blast", "", "", "",false,false); parameters.push_back(psearch); - CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter prealign("realign", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(prealign); - CommandParameter ptrim("trim", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptrim); - CommandParameter psplit("split", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psplit); - CommandParameter pnumwanted("numwanted", "Number", "", "15", "", "", "",false,false); parameters.push_back(pnumwanted); - CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); - CommandParameter pdivergence("divergence", "Number", "", "1.007", "", "", "",false,false); parameters.push_back(pdivergence); - CommandParameter pparents("parents", "Number", "", "3", "", "", "",false,false); parameters.push_back(pparents); - CommandParameter pincrement("increment", "Number", "", "5", "", "", "",false,false); parameters.push_back(pincrement); - CommandParameter pblastlocation("blastlocation", "String", "", "", "", "", "",false,false); parameters.push_back(pblastlocation); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); - CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave); + CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(ptemplate); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","chimera-accnos",false,true,true); parameters.push_back(pfasta); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup); + CommandParameter pwindow("window", "Number", "", "50", "", "", "","",false,false); parameters.push_back(pwindow); + CommandParameter pksize("ksize", "Number", "", "7", "", "", "","",false,false); parameters.push_back(pksize); + CommandParameter pmatch("match", "Number", "", "5.0", "", "", "","",false,false); parameters.push_back(pmatch); + CommandParameter pmismatch("mismatch", "Number", "", "-4.0", "", "", "","",false,false); parameters.push_back(pmismatch); + CommandParameter pminsim("minsim", "Number", "", "90", "", "", "","",false,false); parameters.push_back(pminsim); + CommandParameter pmincov("mincov", "Number", "", "70", "", "", "","",false,false); parameters.push_back(pmincov); + CommandParameter pminsnp("minsnp", "Number", "", "10", "", "", "","",false,false); parameters.push_back(pminsnp); + CommandParameter pminbs("minbs", "Number", "", "90", "", "", "","",false,false); parameters.push_back(pminbs); + CommandParameter psearch("search", "Multiple", "kmer-blast", "blast", "", "", "","",false,false); parameters.push_back(psearch); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + + CommandParameter prealign("realign", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(prealign); + CommandParameter ptrim("trim", "Boolean", "", "F", "", "", "","fasta",false,false); parameters.push_back(ptrim); + CommandParameter psplit("split", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psplit); + CommandParameter pnumwanted("numwanted", "Number", "", "15", "", "", "","",false,false); parameters.push_back(pnumwanted); + CommandParameter piters("iters", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(piters); + CommandParameter pdivergence("divergence", "Number", "", "1.007", "", "", "","",false,false); parameters.push_back(pdivergence); + CommandParameter pdups("dereplicate", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pdups); + CommandParameter pparents("parents", "Number", "", "3", "", "", "","",false,false); parameters.push_back(pparents); + CommandParameter pincrement("increment", "Number", "", "5", "", "", "","",false,false); parameters.push_back(pincrement); + CommandParameter pblastlocation("blastlocation", "String", "", "", "", "", "","",false,false); parameters.push_back(pblastlocation); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); + CommandParameter psave("save", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psave); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -57,16 +61,18 @@ string ChimeraSlayerCommand::getHelpString(){ string helpString = ""; helpString += "The chimera.slayer command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n"; helpString += "This command was modeled after the chimeraSlayer written by the Broad Institute.\n"; - helpString += "The chimera.slayer command parameters are fasta, name, template, processors, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, blastlocation and realign.\n"; + helpString += "The chimera.slayer command parameters are fasta, name, group, template, processors, dereplicate, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, blastlocation and realign.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n"; helpString += "The name parameter allows you to provide a name file, if you are using reference=self. \n"; helpString += "The group parameter allows you to provide a group file. The group file can be used with a namesfile and reference=self. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; + helpString += "The count parameter allows you to provide a count file. The count file reference=self. If your count file contains group information, when checking sequences, only sequences from the same group as the query sequence will be used as the reference. When you use a count file with group info and dereplicate=T, mothur will create a *.pick.count_table file containing seqeunces after chimeras are removed. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"; helpString += "The reference parameter allows you to enter a reference file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; #ifdef USE_MPI helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n"; #endif + helpString += "If the dereplicate parameter is false, then if one group finds the seqeunce to be chimeric, then all groups find it to be chimeric, default=f.\n"; helpString += "The trim parameter allows you to output a new fasta file containing your sequences with the chimeric ones trimmed to include only their longest piece, default=F. \n"; helpString += "The split parameter allows you to check both pieces of non-chimeric sequence for chimeras, thus looking for trimeras and quadmeras. default=F. \n"; helpString += "The window parameter allows you to specify the window size for searching for chimeras, default=50. \n"; @@ -98,6 +104,24 @@ string ChimeraSlayerCommand::getHelpString(){ } } //********************************************************************************************************************** +string ChimeraSlayerCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "chimera") { pattern = "[filename],slayer.chimeras"; } + else if (type == "accnos") { pattern = "[filename],slayer.accnos"; } + else if (type == "fasta") { pattern = "[filename],slayer.fasta"; } + else if (type == "count") { pattern = "[filename],slayer.pick.count_table"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "getOutputPattern"); + exit(1); + } +} +//********************************************************************************************************************** ChimeraSlayerCommand::ChimeraSlayerCommand(){ try { abort = true; calledHelp = true; @@ -106,6 +130,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(){ outputTypes["chimera"] = tempOutNames; outputTypes["accnos"] = tempOutNames; outputTypes["fasta"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "ChimeraSlayerCommand", "ChimeraSlayerCommand"); @@ -117,6 +142,8 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { try { abort = false; calledHelp = false; ReferenceDB* rdb = ReferenceDB::getInstance(); + hasCount = false; + hasName = false; //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -140,6 +167,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { outputTypes["chimera"] = tempOutNames; outputTypes["accnos"] = tempOutNames; outputTypes["fasta"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -225,9 +253,8 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { //check for required parameters - bool hasName = true; namefile = validParameter.validFile(parameters, "name", false); - if (namefile == "not found") { namefile = ""; hasName = false; } + if (namefile == "not found") { namefile = ""; } else { m->splitAtDash(namefile, nameFileNames); @@ -294,12 +321,91 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { } } } + } + + if (nameFileNames.size() != 0) { hasName = true; } + + //check for required parameters + vector countfileNames; + countfile = validParameter.validFile(parameters, "count", false); + if (countfile == "not found") { + countfile = ""; + }else { + m->splitAtDash(countfile, countfileNames); - //make sure there is at least one valid file left - if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; } + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < countfileNames.size(); i++) { + + bool ignore = false; + if (countfileNames[i] == "current") { + countfileNames[i] = m->getCountTableFile(); + if (nameFileNames[i] != "") { m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(countfileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { countfileNames[i] = inputDir + countfileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = m->openInputFile(countfileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + countfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + }else { + m->setCountTableFile(countfileNames[i]); + } + } + } } - - if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } + + if (countfileNames.size() != 0) { hasCount = true; } + + //make sure there is at least one valid file left + if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + + if (!hasName && hasCount) { nameFileNames = countfileNames; } + + if ((hasCount || hasName) && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of name or count files does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } bool hasGroup = true; groupfile = validParameter.validFile(parameters, "group", false); @@ -377,13 +483,13 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } - + if (hasGroup && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; } //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); - convert(temp, processors); + m->mothurConvert(temp, processors); temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } save = m->isTrue(temp); @@ -427,6 +533,12 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { m->mothurOutEndLine(); save = false; } + }else if (hasCount) { templatefile = "self"; + if (save) { + m->mothurOut("[WARNING]: You can't save reference=self, ignoring save."); + m->mothurOutEndLine(); + save = false; + } } else { if (rdb->referenceSeqs.size() != 0) { @@ -441,34 +553,34 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found") { temp = "7"; } - convert(temp, ksize); + m->mothurConvert(temp, ksize); temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "50"; } - convert(temp, window); + m->mothurConvert(temp, window); temp = validParameter.validFile(parameters, "match", false); if (temp == "not found") { temp = "5"; } - convert(temp, match); + m->mothurConvert(temp, match); temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found") { temp = "-4"; } - convert(temp, mismatch); + m->mothurConvert(temp, mismatch); temp = validParameter.validFile(parameters, "divergence", false); if (temp == "not found") { temp = "1.007"; } - convert(temp, divR); + m->mothurConvert(temp, divR); temp = validParameter.validFile(parameters, "minsim", false); if (temp == "not found") { temp = "90"; } - convert(temp, minSimilarity); + m->mothurConvert(temp, minSimilarity); temp = validParameter.validFile(parameters, "mincov", false); if (temp == "not found") { temp = "70"; } - convert(temp, minCoverage); + m->mothurConvert(temp, minCoverage); temp = validParameter.validFile(parameters, "minbs", false); if (temp == "not found") { temp = "90"; } - convert(temp, minBS); + m->mothurConvert(temp, minBS); temp = validParameter.validFile(parameters, "minsnp", false); if (temp == "not found") { temp = "10"; } - convert(temp, minSNP); + m->mothurConvert(temp, minSNP); temp = validParameter.validFile(parameters, "parents", false); if (temp == "not found") { temp = "3"; } - convert(temp, parents); + m->mothurConvert(temp, parents); temp = validParameter.validFile(parameters, "realign", false); if (temp == "not found") { temp = "t"; } realign = m->isTrue(temp); @@ -482,27 +594,31 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { search = validParameter.validFile(parameters, "search", false); if (search == "not found") { search = "blast"; } temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; } - convert(temp, iters); + m->mothurConvert(temp, iters); temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "5"; } - convert(temp, increment); + m->mothurConvert(temp, increment); temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found") { temp = "15"; } - convert(temp, numwanted); + m->mothurConvert(temp, numwanted); + + temp = validParameter.validFile(parameters, "dereplicate", false); + if (temp == "not found") { temp = "false"; } + dups = m->isTrue(temp); blastlocation = validParameter.validFile(parameters, "blastlocation", false); if (blastlocation == "not found") { blastlocation = ""; } else { //add / to name if needed string lastChar = blastlocation.substr(blastlocation.length()-1); -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if (lastChar != "/") { blastlocation += "/"; } #else if (lastChar != "\\") { blastlocation += "\\"; } #endif blastlocation = m->getFullPathName(blastlocation); string formatdbCommand = ""; -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) formatdbCommand = blastlocation + "formatdb"; #else formatdbCommand = blastlocation + "formatdb.exe"; @@ -515,7 +631,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe to run chimera.slayer."); m->mothurOutEndLine(); abort = true; } string blastCommand = ""; -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) blastCommand = blastlocation + "megablast"; #else blastCommand = blastlocation + "megablast.exe"; @@ -529,9 +645,14 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { if ((search != "blast") && (search != "kmer")) { m->mothurOut(search + " is not a valid search."); m->mothurOutEndLine(); abort = true; } - if (hasName && (templatefile != "self")) { m->mothurOut("You have provided a namefile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } + if ((hasName || hasCount) && (templatefile != "self")) { m->mothurOut("You have provided a namefile or countfile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } if (hasGroup && (templatefile != "self")) { m->mothurOut("You have provided a group file and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } + //until we resolve the issue 10-18-11 +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#else + //processors=1; +#endif } } catch(exception& e) { @@ -544,16 +665,28 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { int ChimeraSlayerCommand::execute(){ try{ if (abort == true) { if (calledHelp) { return 0; } return 2; } - + for (int s = 0; s < fastaFileNames.size(); s++) { m->mothurOut("Checking sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine(); int start = time(NULL); + if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]); }//if user entered a file with a path then preserve it + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])); + string outputFileName = getOutputFileName("chimera", variables); + string accnosFileName = getOutputFileName("accnos", variables); + string trimFastaFileName = getOutputFileName("fasta", variables); + string newCountFile = ""; - //you provided a groupfile - string groupFile = ""; - if (groupFileNames.size() != 0) { groupFile = groupFileNames[s]; } + //clears files + ofstream out, out1, out2; + m->openOutputFile(outputFileName, out); out.close(); + m->openOutputFile(accnosFileName, out1); out1.close(); + if (trim) { m->openOutputFile(trimFastaFileName, out2); out2.close(); } + outputNames.push_back(outputFileName); outputTypes["chimera"].push_back(outputFileName); + outputNames.push_back(accnosFileName); outputTypes["accnos"].push_back(accnosFileName); + if (trim) { outputNames.push_back(trimFastaFileName); outputTypes["fasta"].push_back(trimFastaFileName); } //maps a filename to priority map. //if no groupfile this is fastafileNames[s] -> prioirity @@ -561,143 +694,130 @@ int ChimeraSlayerCommand::execute(){ map > fileToPriority; map >::iterator itFile; map fileGroup; - map priority; fileToPriority[fastaFileNames[s]] = priority; //default fileGroup[fastaFileNames[s]] = "noGroup"; - SequenceParser* parser = NULL; - int totalSeqs = 0; + map uniqueNames; int totalChimeras = 0; - - if ((templatefile == "self") && (groupFile == "")) { - fileGroup.clear(); - fileToPriority.clear(); - if (processors != 1) { m->mothurOut("When using template=self, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } - string nameFile = ""; - if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one - nameFile = nameFileNames[s]; - }else { nameFile = getNamesFile(fastaFileNames[s]); } - - //sort fastafile by abundance, returns new sorted fastafile name - m->mothurOut("Sorting fastafile according to abundance..."); cout.flush(); - priority = sortFastaFile(fastaFileNames[s], nameFile); - m->mothurOut("Done."); m->mothurOutEndLine(); - - fileToPriority[fastaFileNames[s]] = priority; - if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - }else if ((templatefile == "self") && (groupFile != "")) { - fileGroup.clear(); - fileToPriority.clear(); - string nameFile = ""; - if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one - nameFile = nameFileNames[s]; - }else { nameFile = getNamesFile(fastaFileNames[s]); } - - //Parse sequences by group - parser = new SequenceParser(groupFile, fastaFileNames[s], nameFile); - vector groups = parser->getNamesOfGroups(); - - for (int i = 0; i < groups.size(); i++) { - vector thisGroupsSeqs = parser->getSeqs(groups[i]); - map thisGroupsMap = parser->getNameMap(groups[i]); - string newFastaFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + groups[i] + "-sortedTemp.fasta"; - priority = sortFastaFile(thisGroupsSeqs, thisGroupsMap, newFastaFile); - fileToPriority[newFastaFile] = priority; - fileGroup[newFastaFile] = groups[i]; - } - } - - if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]); }//if user entered a file with a path then preserve it - string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "slayer.chimera"; - string accnosFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "slayer.accnos"; - string trimFastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "slayer.fasta"; - - //clears files - ofstream out, out1, out2; - m->openOutputFile(outputFileName, out); out.close(); - m->openOutputFile(accnosFileName, out1); out1.close(); - if (trim) { m->openOutputFile(trimFastaFileName, out2); out2.close(); } - outputNames.push_back(outputFileName); outputTypes["chimera"].push_back(outputFileName); - outputNames.push_back(accnosFileName); outputTypes["accnos"].push_back(accnosFileName); - if (trim) { outputNames.push_back(trimFastaFileName); outputTypes["fasta"].push_back(trimFastaFileName); } - - - for (itFile = fileToPriority.begin(); itFile != fileToPriority.end(); itFile++) { - + lines.clear(); + + if (templatefile == "self") { + if (hasCount) { + SequenceCountParser* parser = NULL; + setUpForSelfReference(parser, fileGroup, fileToPriority, s); + if (parser != NULL) { uniqueNames = parser->getAllSeqsMap(); delete parser; } + }else { + SequenceParser* parser = NULL; + setUpForSelfReference(parser, fileGroup, fileToPriority, s); + if (parser != NULL) { uniqueNames = parser->getAllSeqsMap(); delete parser; } + } + } + + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + if (fileToPriority.size() == 1) { //you running without a groupfile + itFile = fileToPriority.begin(); string thisFastaName = itFile->first; map thisPriority = itFile->second; - - //this is true when you have parsed by groups - if (fileToPriority.size() > 1) { m->mothurOutEndLine(); m->mothurOut("Checking sequences from group: " + fileGroup[thisFastaName] + "."); m->mothurOutEndLine(); } - - string thisoutputFileName = outputDir + m->getRootName(m->getSimpleName(thisFastaName)) + fileGroup[thisFastaName] + "slayer.chimera"; - string thisaccnosFileName = outputDir + m->getRootName(m->getSimpleName(thisFastaName)) + fileGroup[thisFastaName] + "slayer.accnos"; - string thistrimFastaFileName = outputDir + m->getRootName(m->getSimpleName(thisFastaName)) + fileGroup[thisFastaName] + "slayer.fasta"; - - //create chimera here if you are mac or linux because fork will copy for you. Create in create processes instead if you are windows. - if (processors == 1) { templateSeqsLength = setupChimera(thisFastaName, thisPriority); } +#ifdef USE_MPI + MPIExecute(thisFastaName, outputFileName, accnosFileName, trimFastaFileName, thisPriority); +#else + //break up file + vector positions; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + positions = m->divideFile(thisFastaName, processors); + for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); } +#else + if (processors == 1) { lines.push_back(linePair(0, 1000)); } else { - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || USE_MPI - templateSeqsLength = setupChimera(thisFastaName, thisPriority); - #endif - } - - if (m->control_pressed) { if (parser != NULL) { delete parser; } for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - - #ifdef USE_MPI - MPIExecute(thisFastaName, thisoutputFileName, thisaccnosFileName, thistrimFastaFileName); - if (m->control_pressed) { outputTypes.clear(); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - #else - //break up file - vector positions; - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - positions = m->divideFile(thisFastaName, processors); - for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); } - #else - if (processors == 1) { lines.push_back(new linePair(0, 1000)); } - else { - positions = m->setFilePosFasta(thisFastaName, numSeqs); + positions = m->setFilePosFasta(thisFastaName, numSeqs); + if (positions.size() < processors) { processors = positions.size(); } - //figure out how many sequences you have to process - int numSeqsPerProcessor = numSeqs / processors; - for (int i = 0; i < processors; i++) { - int startIndex = i * numSeqsPerProcessor; - if(i == (processors - 1)){ numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; } - lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor)); - } + //figure out how many sequences you have to process + int numSeqsPerProcessor = numSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; } + lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor)); } - #endif - - if(processors == 1){ - numSeqs = driver(lines[0], thisoutputFileName, thisFastaName, thisaccnosFileName, thistrimFastaFileName); - - int numNoParents = chimera->getNumNoParents(); - if (numNoParents == numSeqs) { m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors."); m->mothurOutEndLine(); } - - }else{ numSeqs = createProcesses(thisoutputFileName, thisFastaName, thisaccnosFileName, thistrimFastaFileName); } - - if (m->control_pressed) { if (parser != NULL) { delete parser; } outputTypes.clear(); if (trim) { m->mothurRemove(trimFastaFileName); } m->mothurRemove(outputFileName); m->mothurRemove(accnosFileName); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); delete chimera; return 0; } - - #endif - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || USE_MPI - delete chimera; - #endif - - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + } +#endif + if(processors == 1){ numSeqs = driver(lines[0], outputFileName, thisFastaName, accnosFileName, trimFastaFileName, thisPriority); } + else{ numSeqs = createProcesses(outputFileName, thisFastaName, accnosFileName, trimFastaFileName, thisPriority); } - //append files - m->appendFiles(thisoutputFileName, outputFileName); m->mothurRemove(thisoutputFileName); - totalChimeras = m->appendFiles(thisaccnosFileName, accnosFileName); m->mothurRemove(thisaccnosFileName); - if (trim) { m->appendFiles(thistrimFastaFileName, trimFastaFileName); m->mothurRemove(thistrimFastaFileName); } + if (m->control_pressed) { outputTypes.clear(); if (trim) { m->mothurRemove(trimFastaFileName); } m->mothurRemove(outputFileName); m->mothurRemove(accnosFileName); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } +#endif + }else { //you have provided a groupfile + string countFile = ""; + if (hasCount) { + countFile = nameFileNames[s]; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFileNames[s])); + newCountFile = getOutputFileName("count", variables); + } +#ifdef USE_MPI + MPIExecuteGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup, newCountFile, countFile); +#else + if (processors == 1) { + numSeqs = driverGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup, newCountFile); + if (hasCount && dups) { + CountTable c; c.readTable(nameFileNames[s], true); + if (!m->isBlank(newCountFile)) { + ifstream in2; + m->openInputFile(newCountFile, in2); + + string name, group; + while (!in2.eof()) { + in2 >> name >> group; m->gobble(in2); + c.setAbund(name, group, 0); + } + in2.close(); + } + m->mothurRemove(newCountFile); + c.printTable(newCountFile); + } + + } + else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup, newCountFile, countFile); } //destroys fileToPriority +#endif + +#ifdef USE_MPI + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - totalSeqs += numSeqs; + if (pid == 0) { +#endif + if (!dups) { + totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, trimFastaFileName); + m->mothurOutEndLine(); m->mothurOut(toString(totalChimeras) + " chimera found."); m->mothurOutEndLine(); + }else { + if (hasCount) { + set doNotRemove; + CountTable c; c.readTable(newCountFile, true); + vector namesInTable = c.getNamesOfSeqs(); + for (int i = 0; i < namesInTable.size(); i++) { + int temp = c.getNumSeqs(namesInTable[i]); + if (temp == 0) { c.remove(namesInTable[i]); } + else { doNotRemove.insert((namesInTable[i])); } + } + //remove names we want to keep from accnos file. + set accnosNames = m->readAccnos(accnosFileName); + ofstream out2; + m->openOutputFile(accnosFileName, out2); + for (set::iterator it = accnosNames.begin(); it != accnosNames.end(); it++) { + if (doNotRemove.count(*it) == 0) { out2 << (*it) << endl; } + } + out2.close(); + c.printTable(newCountFile); + outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile); + } + } + +#ifdef USE_MPI + } + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait +#endif } - if (fileToPriority.size() > 1) { totalChimeras = deconvoluteResults(parser, outputFileName, accnosFileName, trimFastaFileName); } - - if (parser != NULL) { delete parser; } - - m->mothurOutEndLine(); m->mothurOut(toString(totalChimeras) + " chimera found."); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(totalSeqs) + " sequences."); m->mothurOutEndLine(); + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); } //set accnos file as new current accnosfile @@ -714,6 +834,11 @@ int ChimeraSlayerCommand::execute(){ } } + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } @@ -728,7 +853,162 @@ int ChimeraSlayerCommand::execute(){ } } //********************************************************************************************************************** -int ChimeraSlayerCommand::MPIExecute(string inputFile, string outputFileName, string accnosFileName, string trimFastaFileName){ +int ChimeraSlayerCommand::MPIExecuteGroups(string outputFileName, string accnosFileName, string trimFastaFileName, map >& fileToPriority, map& fileGroup, string countlist, string countfile){ + try { +#ifdef USE_MPI + int pid; + int tag = 2001; + + MPI_Status status; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + //put filenames in a vector, then pass each process a starting and ending point in the vector + //all processes already have the fileToPriority and fileGroup, they just need to know which files to process + map >::iterator itFile; + vector filenames; + for(itFile = fileToPriority.begin(); itFile != fileToPriority.end(); itFile++) { filenames.push_back(itFile->first); } + + int numGroupsPerProcessor = filenames.size() / processors; + int startIndex = pid * numGroupsPerProcessor; + int endIndex = (pid+1) * numGroupsPerProcessor; + if(pid == (processors - 1)){ endIndex = filenames.size(); } + + vector MPIPos; + + MPI_File outMPI; + MPI_File outMPIAccnos; + MPI_File outMPIFasta; + MPI_File outMPICount; + + int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + int inMode=MPI_MODE_RDONLY; + + char outFilename[1024]; + strcpy(outFilename, outputFileName.c_str()); + + char outAccnosFilename[1024]; + strcpy(outAccnosFilename, accnosFileName.c_str()); + + char outFastaFilename[1024]; + strcpy(outFastaFilename, trimFastaFileName.c_str()); + + char outCountFilename[1024]; + strcpy(outCountFilename, countlist.c_str()); + + MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI); + MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos); + if (trim) { MPI_File_open(MPI_COMM_WORLD, outFastaFilename, outMode, MPI_INFO_NULL, &outMPIFasta); } + if (hasCount && dups) { MPI_File_open(MPI_COMM_WORLD, outCountFilename, outMode, MPI_INFO_NULL, &outMPICount); } + + if (m->control_pressed) { MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); if (hasCount && dups) { MPI_File_close(&outMPICount); } return 0; } + + //print headers + if (pid == 0) { //you are the root process + m->mothurOutEndLine(); + m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results."); + m->mothurOutEndLine(); + + string outTemp = "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n"; + + //print header + int length = outTemp.length(); + char* buf2 = new char[length]; + memcpy(buf2, outTemp.c_str(), length); + + MPI_File_write_shared(outMPI, buf2, length, MPI_CHAR, &status); + delete buf2; + } + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait + + for (int i = startIndex; i < endIndex; i++) { + + int start = time(NULL); + int num = 0; + string thisFastaName = filenames[i]; + map thisPriority = fileToPriority[thisFastaName]; + + char inFileName[1024]; + strcpy(inFileName, thisFastaName.c_str()); + MPI_File inMPI; + MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + + MPIPos = m->setFilePosFasta(thisFastaName, num); //fills MPIPos, returns numSeqs + + cout << endl << "Checking sequences from group: " << fileGroup[thisFastaName] << "." << endl; + + set cnames; + driverMPI(0, num, inMPI, outMPI, outMPIAccnos, outMPIFasta, cnames, MPIPos, thisFastaName, thisPriority, true); + numSeqs += num; + + MPI_File_close(&inMPI); + m->mothurRemove(thisFastaName); + + if (dups) { + if (cnames.size() != 0) { + if (hasCount) { + for (set::iterator it = cnames.begin(); it != cnames.end(); it++) { + string outputString = (*it) + "\t" + fileGroup[thisFastaName] + "\n"; + int length = outputString.length(); + char* buf2 = new char[length]; + memcpy(buf2, outputString.c_str(), length); + MPI_File_write_shared(outMPICount, buf2, length, MPI_CHAR, &status); + delete buf2; + } + }else { + map >::iterator itGroupNameMap = group2NameMap.find(fileGroup[thisFastaName]); + if (itGroupNameMap != group2NameMap.end()) { + map thisnamemap = itGroupNameMap->second; + map::iterator itN; + for (set::iterator it = cnames.begin(); it != cnames.end(); it++) { + itN = thisnamemap.find(*it); + if (itN != thisnamemap.end()) { + vector tempNames; m->splitAtComma(itN->second, tempNames); + for (int j = 0; j < tempNames.size(); j++) { //write to accnos file + string outputString = tempNames[j] + "\n"; + int length = outputString.length(); + char* buf2 = new char[length]; + memcpy(buf2, outputString.c_str(), length); + + MPI_File_write_shared(outMPIAccnos, buf2, length, MPI_CHAR, &status); + delete buf2; + } + + }else { m->mothurOut("[ERROR]: parsing cannot find " + *it + ".\n"); m->control_pressed = true; } + } + }else { m->mothurOut("[ERROR]: parsing cannot find " + fileGroup[thisFastaName] + ".\n"); m->control_pressed = true; } + } + + } + } + + cout << endl << "It took " << toString(time(NULL) - start) << " secs to check " + toString(num) + " sequences from group " << fileGroup[thisFastaName] << "." << endl; + } + + if (pid == 0) { + for(int i = 1; i < processors; i++) { + int temp = 0; + MPI_Recv(&temp, 1, MPI_INT, i, 2001, MPI_COMM_WORLD, &status); + numSeqs += temp; + } + }else{ MPI_Send(&numSeqs, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD); } + + MPI_File_close(&outMPI); + MPI_File_close(&outMPIAccnos); + if (trim) { MPI_File_close(&outMPIFasta); } + if (hasCount && dups) { MPI_File_close(&outMPICount); } + + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait +#endif + return 0; + + }catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "MPIExecuteGroups"); + exit(1); + } +} +//********************************************************************************************************************** +int ChimeraSlayerCommand::MPIExecute(string inputFile, string outputFileName, string accnosFileName, string trimFastaFileName, map& priority){ try { #ifdef USE_MPI @@ -765,7 +1045,7 @@ int ChimeraSlayerCommand::MPIExecute(string inputFile, string outputFileName, st MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos); if (trim) { MPI_File_open(MPI_COMM_WORLD, outFastaFilename, outMode, MPI_INFO_NULL, &outMPIFasta); } - if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); delete chimera; return 0; } + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); return 0; } if (pid == 0) { //you are the root process m->mothurOutEndLine(); @@ -802,19 +1082,10 @@ int ChimeraSlayerCommand::MPIExecute(string inputFile, string outputFileName, st } //do your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos); - - int numNoParents = chimera->getNumNoParents(); - int temp; - for(int i = 1; i < processors; i++) { - MPI_Recv(&temp, 1, MPI_INT, 1, tag, MPI_COMM_WORLD, &status); - numNoParents += temp; - } - - - if (numSeqs == numNoParents) { m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors."); m->mothurOutEndLine(); } - - if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); delete chimera; return 0; } + set cnames; + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, cnames, MPIPos, inputFile, priority, false); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); return 0; } }else{ //you are a child process if (templatefile != "self") { //if template=self we can only use 1 processor @@ -828,12 +1099,10 @@ int ChimeraSlayerCommand::MPIExecute(string inputFile, string outputFileName, st if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } //do your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos); - - int numNoParents = chimera->getNumNoParents(); - MPI_Send(&numNoParents, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); + set cnames; + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, cnames, MPIPos, inputFile, priority, false); - if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); delete chimera; return 0; } + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); return 0; } } } @@ -847,7 +1116,7 @@ int ChimeraSlayerCommand::MPIExecute(string inputFile, string outputFileName, st #endif - return 0; + return numSeqs; } catch(exception& e) { m->errorOut(e, "ChimeraSlayerCommand", "MPIExecute"); @@ -855,11 +1124,20 @@ int ChimeraSlayerCommand::MPIExecute(string inputFile, string outputFileName, st } } //********************************************************************************************************************** -int ChimeraSlayerCommand::deconvoluteResults(SequenceParser* parser, string outputFileName, string accnosFileName, string trimFileName){ +int ChimeraSlayerCommand::deconvoluteResults(map& uniqueNames, string outputFileName, string accnosFileName, string trimFileName){ try { - map uniqueNames = parser->getAllSeqsMap(); map::iterator itUnique; int total = 0; + + if (trimera) { //add in more potential uniqueNames + map newUniqueNames = uniqueNames; + for (map::iterator it = uniqueNames.begin(); it != uniqueNames.end(); it++) { + newUniqueNames[(it->first)+"_LEFT"] = (it->first)+"_LEFT"; + newUniqueNames[(it->first)+"_RIGHT"] = (it->first)+"_RIGHT"; + } + uniqueNames = newUniqueNames; + newUniqueNames.clear(); + } //edit accnos file ifstream in2; @@ -945,9 +1223,10 @@ int ChimeraSlayerCommand::deconvoluteResults(SequenceParser* parser, string outp itChimeras = chimerasInFile.find(itUnique->second); if (itChimeras == chimerasInFile.end()) { + //is this sequence not already in the file itNames = namesInFile.find((itUnique->second)); - if (itNames == namesInFile.end()) {cout << itUnique->second << endl; out << itUnique->second << '\t' << "no" << endl; namesInFile.insert(itUnique->second); } + if (itNames == namesInFile.end()) { out << itUnique->second << '\t' << "no" << endl; namesInFile.insert(itUnique->second); } } } }else { //read the rest of the line @@ -972,13 +1251,14 @@ int ChimeraSlayerCommand::deconvoluteResults(SequenceParser* parser, string outp //then you really are a no so print, otherwise skip if (itChimeras == chimerasInFile.end()) { print = true; } + }else{ print = true; } } } if (print) { out << name << '\t'; - cout << name<< endl; + namesInFile.insert(name); //output parent1's name @@ -1044,29 +1324,102 @@ int ChimeraSlayerCommand::deconvoluteResults(SequenceParser* parser, string outp m->errorOut(e, "ChimeraSlayerCommand", "deconvoluteResults"); exit(1); } -} +} //********************************************************************************************************************** -int ChimeraSlayerCommand::setupChimera(string inputFile, map& priority){ +int ChimeraSlayerCommand::setUpForSelfReference(SequenceParser*& parser, map& fileGroup, map >& fileToPriority, int s){ try { - if (templatefile != "self") { //you want to run slayer with a reference template - chimera = new ChimeraSlayer(inputFile, templatefile, trim, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign, blastlocation, rand()); + fileGroup.clear(); + fileToPriority.clear(); + + string nameFile = ""; + if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one + nameFile = nameFileNames[s]; + }else { nameFile = getNamesFile(fastaFileNames[s]); } + + //you provided a groupfile + string groupFile = ""; + if (groupFileNames.size() != 0) { groupFile = groupFileNames[s]; } + + if (groupFile == "") { + if (processors != 1) { m->mothurOut("When using template=self, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } + + //sort fastafile by abundance, returns new sorted fastafile name + m->mothurOut("Sorting fastafile according to abundance..."); cout.flush(); + priority = sortFastaFile(fastaFileNames[s], nameFile); + m->mothurOut("Done."); m->mothurOutEndLine(); + + fileToPriority[fastaFileNames[s]] = priority; + fileGroup[fastaFileNames[s]] = "noGroup"; }else { - chimera = new ChimeraSlayer(inputFile, templatefile, trim, priority, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign, blastlocation, rand()); + //Parse sequences by group + parser = new SequenceParser(groupFile, fastaFileNames[s], nameFile); + vector groups = parser->getNamesOfGroups(); + + for (int i = 0; i < groups.size(); i++) { + vector thisGroupsSeqs = parser->getSeqs(groups[i]); + map thisGroupsMap = parser->getNameMap(groups[i]); + group2NameMap[groups[i]] = thisGroupsMap; + string newFastaFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + groups[i] + "-sortedTemp.fasta"; + priority = sortFastaFile(thisGroupsSeqs, thisGroupsMap, newFastaFile); + fileToPriority[newFastaFile] = priority; + fileGroup[newFastaFile] = groups[i]; + } } - if (m->control_pressed) { delete chimera; return 0; } - if (chimera->getUnaligned()) { delete chimera; m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine(); m->control_pressed = true; return 0; } + return 0; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "setUpForSelfReference"); + exit(1); + } +} +//********************************************************************************************************************** +int ChimeraSlayerCommand::setUpForSelfReference(SequenceCountParser*& parser, map& fileGroup, map >& fileToPriority, int s){ + try { + fileGroup.clear(); + fileToPriority.clear(); + + string nameFile = ""; + if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one + nameFile = nameFileNames[s]; + }else { m->control_pressed = true; return 0; } + + CountTable ct; + if (!ct.testGroups(nameFile)) { + if (processors != 1) { m->mothurOut("When using template=self, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } + + //sort fastafile by abundance, returns new sorted fastafile name + m->mothurOut("Sorting fastafile according to abundance..."); cout.flush(); + priority = sortFastaFile(fastaFileNames[s], nameFile); + m->mothurOut("Done."); m->mothurOutEndLine(); + + fileToPriority[fastaFileNames[s]] = priority; + fileGroup[fastaFileNames[s]] = "noGroup"; + }else { + //Parse sequences by group + parser = new SequenceCountParser(nameFile, fastaFileNames[s]); + vector groups = parser->getNamesOfGroups(); + + for (int i = 0; i < groups.size(); i++) { + vector thisGroupsSeqs = parser->getSeqs(groups[i]); + map thisGroupsMap = parser->getCountTable(groups[i]); + string newFastaFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + groups[i] + "-sortedTemp.fasta"; + sortFastaFile(thisGroupsSeqs, thisGroupsMap, newFastaFile); + fileToPriority[newFastaFile] = thisGroupsMap; + fileGroup[newFastaFile] = groups[i]; + } + } + - return (chimera->getLength()); + return 0; } catch(exception& e) { - m->errorOut(e, "ChimeraSlayerCommand", "setupChimera"); + m->errorOut(e, "ChimeraSlayerCommand", "setUpForSelfReference"); exit(1); } } //********************************************************************************************************************** - string ChimeraSlayerCommand::getNamesFile(string& inputFile){ try { string nameFile = ""; @@ -1077,14 +1430,15 @@ string ChimeraSlayerCommand::getNamesFile(string& inputFile){ string inputString = "fasta=" + inputFile; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); - + m->mothurCalling = true; + Command* uniqueCommand = new DeconvoluteCommand(inputString); uniqueCommand->execute(); map > filenames = uniqueCommand->getOutputFiles(); delete uniqueCommand; - + m->mothurCalling = false; m->mothurOut("/******************************************/"); m->mothurOutEndLine(); nameFile = filenames["name"][0]; @@ -1099,8 +1453,279 @@ string ChimeraSlayerCommand::getNamesFile(string& inputFile){ } //********************************************************************************************************************** -int ChimeraSlayerCommand::driver(linePair* filePos, string outputFName, string filename, string accnos, string fasta){ +int ChimeraSlayerCommand::driverGroups(string outputFName, string accnos, string fasta, map >& fileToPriority, map& fileGroup, string countlist){ + try { + int totalSeqs = 0; + ofstream outCountList; + + if (hasCount && dups) { m->openOutputFile(countlist, outCountList); } + + for (map >::iterator itFile = fileToPriority.begin(); itFile != fileToPriority.end(); itFile++) { + + if (m->control_pressed) { return 0; } + + int start = time(NULL); + string thisFastaName = itFile->first; + map thisPriority = itFile->second; + string thisoutputFileName = outputDir + m->getRootName(m->getSimpleName(thisFastaName)) + fileGroup[thisFastaName] + "slayer.chimera"; + string thisaccnosFileName = outputDir + m->getRootName(m->getSimpleName(thisFastaName)) + fileGroup[thisFastaName] + "slayer.accnos"; + string thistrimFastaFileName = outputDir + m->getRootName(m->getSimpleName(thisFastaName)) + fileGroup[thisFastaName] + "slayer.fasta"; + + m->mothurOutEndLine(); m->mothurOut("Checking sequences from group: " + fileGroup[thisFastaName] + "."); m->mothurOutEndLine(); + + lines.clear(); +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + int proc = 1; + vector positions = m->divideFile(thisFastaName, proc); + lines.push_back(linePair(positions[0], positions[1])); +#else + lines.push_back(linePair(0, 1000)); +#endif + int numSeqs = driver(lines[0], thisoutputFileName, thisFastaName, thisaccnosFileName, thistrimFastaFileName, thisPriority); + + //if we provided a count file with group info and set dereplicate=t, then we want to create a *.pick.count_table + //This table will zero out group counts for seqs determined to be chimeric by that group. + if (dups) { + if (!m->isBlank(thisaccnosFileName)) { + ifstream in; + m->openInputFile(thisaccnosFileName, in); + string name; + if (hasCount) { + while (!in.eof()) { + in >> name; m->gobble(in); + outCountList << name << '\t' << fileGroup[thisFastaName] << endl; + } + in.close(); + }else { + map >::iterator itGroupNameMap = group2NameMap.find(fileGroup[thisFastaName]); + if (itGroupNameMap != group2NameMap.end()) { + map thisnamemap = itGroupNameMap->second; + map::iterator itN; + ofstream out; + m->openOutputFile(thisaccnosFileName+".temp", out); + while (!in.eof()) { + in >> name; m->gobble(in); + itN = thisnamemap.find(name); + if (itN != thisnamemap.end()) { + vector tempNames; m->splitAtComma(itN->second, tempNames); + for (int j = 0; j < tempNames.size(); j++) { out << tempNames[j] << endl; } + + }else { m->mothurOut("[ERROR]: parsing cannot find " + name + ".\n"); m->control_pressed = true; } + } + out.close(); + in.close(); + m->renameFile(thisaccnosFileName+".temp", thisaccnosFileName); + }else { m->mothurOut("[ERROR]: parsing cannot find " + fileGroup[thisFastaName] + ".\n"); m->control_pressed = true; } + } + + } + } + + //append files + m->appendFiles(thisoutputFileName, outputFName); m->mothurRemove(thisoutputFileName); + m->appendFiles(thisaccnosFileName, accnos); m->mothurRemove(thisaccnosFileName); + if (trim) { m->appendFiles(thistrimFastaFileName, fasta); m->mothurRemove(thistrimFastaFileName); } + m->mothurRemove(thisFastaName); + + totalSeqs += numSeqs; + + m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + fileGroup[thisFastaName] + "."); m->mothurOutEndLine(); + } + + if (hasCount && dups) { outCountList.close(); } + + return totalSeqs; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "driverGroups"); + exit(1); + } +} +/**************************************************************************************************/ +int ChimeraSlayerCommand::createProcessesGroups(string outputFName, string accnos, string fasta, map >& fileToPriority, map& fileGroup, string countlist, string countFile) { + try { + int process = 1; + int num = 0; + processIDS.clear(); + + if (fileToPriority.size() < processors) { processors = fileToPriority.size(); } + + CountTable newCount; + if (hasCount && dups) { newCount.readTable(countFile, true); } + + int groupsPerProcessor = fileToPriority.size() / processors; + int remainder = fileToPriority.size() % processors; + + vector< map > > breakUp; + + for (int i = 0; i < processors; i++) { + map > thisFileToPriority; + map >::iterator itFile; + int count = 0; + int enough = groupsPerProcessor; + if (i == 0) { enough = groupsPerProcessor + remainder; } + + for (itFile = fileToPriority.begin(); itFile != fileToPriority.end();) { + thisFileToPriority[itFile->first] = itFile->second; + fileToPriority.erase(itFile++); + count++; + if (count == enough) { break; } + } + breakUp.push_back(thisFileToPriority); + } + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + num = driverGroups(outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", fasta + toString(getpid()) + ".temp", breakUp[process], fileGroup, accnos + toString(getpid()) + ".byCount"); + + //pass numSeqs to parent + ofstream out; + string tempFile = outputFName + toString(getpid()) + ".num.temp"; + m->openOutputFile(tempFile, out); + out << num << endl; + out.close(); + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } + } + + num = driverGroups(outputFName, accnos, fasta, breakUp[0], fileGroup, accnos + ".byCount"); + + //force parent to wait until all the processes are done + for (int i=0;iopenInputFile(tempFile, in); + if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } + in.close(); m->mothurRemove(tempFile); + } +#else + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the slayerData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for(int i=1; ifileToPriority.size() != pDataArray[i]->end) { + m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->end) + " of " + toString(pDataArray[i]->fileToPriority.size()) + " groups assigned to it, quitting. \n"); m->control_pressed = true; + } + num += pDataArray[i]->count; + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } +#endif + //read my own + if (hasCount && dups) { + if (!m->isBlank(accnos + ".byCount")) { + ifstream in2; + m->openInputFile(accnos + ".byCount", in2); + + string name, group; + while (!in2.eof()) { + in2 >> name >> group; m->gobble(in2); + newCount.setAbund(name, group, 0); + } + in2.close(); + } + m->mothurRemove(accnos + ".byCount"); + } + + + //append output files + for(int i=0;iappendFiles((outputFName + toString(processIDS[i]) + ".temp"), outputFName); + m->mothurRemove((outputFName + toString(processIDS[i]) + ".temp")); + + m->appendFiles((accnos + toString(processIDS[i]) + ".temp"), accnos); + m->mothurRemove((accnos + toString(processIDS[i]) + ".temp")); + + if (trim) { + m->appendFiles((fasta + toString(processIDS[i]) + ".temp"), fasta); + m->mothurRemove((fasta + toString(processIDS[i]) + ".temp")); + } + + if (hasCount && dups) { + if (!m->isBlank(accnos + toString(processIDS[i]) + ".byCount")) { + ifstream in2; + m->openInputFile(accnos + toString(processIDS[i]) + ".byCount", in2); + + string name, group; + while (!in2.eof()) { + in2 >> name >> group; m->gobble(in2); + newCount.setAbund(name, group, 0); + } + in2.close(); + } + m->mothurRemove(accnos + toString(processIDS[i]) + ".byCount"); + } + + } + + //print new *.pick.count_table + if (hasCount && dups) { newCount.printTable(countlist); } + + return num; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "createProcessesGroups"); + exit(1); + } +} +//********************************************************************************************************************** + +int ChimeraSlayerCommand::driver(linePair filePos, string outputFName, string filename, string accnos, string fasta, map& priority){ try { + + Chimera* chimera; + if (templatefile != "self") { //you want to run slayer with a reference template + chimera = new ChimeraSlayer(filename, templatefile, trim, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign, blastlocation, rand()); + }else { + chimera = new ChimeraSlayer(filename, templatefile, trim, priority, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign, blastlocation, rand()); + } + + if (m->control_pressed) { delete chimera; return 0; } + + if (chimera->getUnaligned()) { delete chimera; m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine(); m->control_pressed = true; return 0; } + templateSeqsLength = chimera->getLength(); + ofstream out; m->openOutputFile(outputFName, out); @@ -1113,16 +1738,16 @@ int ChimeraSlayerCommand::driver(linePair* filePos, string outputFName, string f ifstream inFASTA; m->openInputFile(filename, inFASTA); - inFASTA.seekg(filePos->start); + inFASTA.seekg(filePos.start); - if (filePos->start == 0) { chimera->printHeader(out); } + if (filePos.start == 0) { chimera->printHeader(out); } bool done = false; int count = 0; while (!done) { - if (m->control_pressed) { out.close(); out2.close(); if (trim) { out3.close(); } inFASTA.close(); return 1; } + if (m->control_pressed) { delete chimera; out.close(); out2.close(); if (trim) { out3.close(); } inFASTA.close(); return 1; } Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA); string candidateAligned = candidateSeq->getAligned(); @@ -1134,7 +1759,7 @@ int ChimeraSlayerCommand::driver(linePair* filePos, string outputFName, string f //find chimeras chimera->getChimeras(candidateSeq); - if (m->control_pressed) { delete candidateSeq; return 1; } + if (m->control_pressed) { delete chimera; delete candidateSeq; return 1; } //if you are not chimeric, then check each half data_results wholeResults = chimera->getResults(); @@ -1189,26 +1814,32 @@ int ChimeraSlayerCommand::driver(linePair* filePos, string outputFName, string f count++; } - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) unsigned long long pos = inFASTA.tellg(); - if ((pos == -1) || (pos >= filePos->end)) { break; } + if ((pos == -1) || (pos >= filePos.end)) { break; } #else if (inFASTA.eof()) { break; } #endif delete candidateSeq; //report progress - if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + if((count) % 100 == 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count) + "\n"); } } //report progress - if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + if((count) % 100 != 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+ "\n"); } + + int numNoParents = chimera->getNumNoParents(); + if (numNoParents == count) { m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors."); m->mothurOutEndLine(); } out.close(); out2.close(); if (trim) { out3.close(); } inFASTA.close(); + delete chimera; return count; + + } catch(exception& e) { m->errorOut(e, "ChimeraSlayerCommand", "driver"); @@ -1217,15 +1848,27 @@ int ChimeraSlayerCommand::driver(linePair* filePos, string outputFName, string f } //********************************************************************************************************************** #ifdef USE_MPI -int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, MPI_File& outFastaMPI, vector& MPIPos){ - try { +int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, MPI_File& outFastaMPI, set& cnames, vector& MPIPos, string filename, map& priority, bool byGroup){ + try { MPI_Status status; int pid; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + Chimera* chimera; + if (templatefile != "self") { //you want to run slayer with a reference template + chimera = new ChimeraSlayer(filename, templatefile, trim, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign, blastlocation, rand()); + }else { + chimera = new ChimeraSlayer(filename, templatefile, trim, priority, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign, blastlocation, rand(), byGroup); + } + + if (m->control_pressed) { delete chimera; return 0; } + + if (chimera->getUnaligned()) { delete chimera; m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine(); m->control_pressed = true; return 0; } + templateSeqsLength = chimera->getLength(); + for(int i=0;icontrol_pressed) { return 1; } + if (m->control_pressed) { delete chimera; return 1; } //read next sequence int length = MPIPos[start+i+1] - MPIPos[start+i]; @@ -1251,7 +1894,7 @@ int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_Fil //find chimeras chimera->getChimeras(candidateSeq); - if (m->control_pressed) { delete candidateSeq; return 1; } + if (m->control_pressed) { delete chimera; delete candidateSeq; return 1; } //if you are not chimeric, then check each half data_results wholeResults = chimera->getResults(); @@ -1289,7 +1932,10 @@ int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_Fil data_results rightResults = chimera->getResults(); //if either piece is chimeric then report - Sequence trimmed = chimera->print(outMPI, outAccMPI, leftResults, rightResults); + bool flag = false; + Sequence trimmed = chimera->print(outMPI, outAccMPI, leftResults, rightResults, flag); + if (flag) { cnames.insert(candidateSeq->getName()); } + if (trim) { string outputString = ">" + trimmed.getName() + "\n" + trimmed.getAligned() + "\n"; @@ -1307,6 +1953,7 @@ int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_Fil }else { //print results Sequence trimmed = chimera->print(outMPI, outAccMPI); + cnames.insert(candidateSeq->getName()); if (trim) { string outputString = ">" + trimmed.getName() + "\n" + trimmed.getAligned() + "\n"; @@ -1326,12 +1973,15 @@ int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_Fil delete candidateSeq; //report progress - if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n"); } + if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; } } //report progress - if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n"); } + if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; } - + int numNoParents = chimera->getNumNoParents(); + if (numNoParents == num) { cout << "[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors." << endl; } + + delete chimera; return 0; } catch(exception& e) { @@ -1343,14 +1993,13 @@ int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_Fil /**************************************************************************************************/ -int ChimeraSlayerCommand::createProcesses(string outputFileName, string filename, string accnos, string fasta) { +int ChimeraSlayerCommand::createProcesses(string outputFileName, string filename, string accnos, string fasta, map& thisPriority) { try { int process = 0; int num = 0; - int numNoParents = 0; processIDS.clear(); -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) //loop through and create all the processes you want while (process != processors) { int pid = fork(); @@ -1359,13 +2008,13 @@ int ChimeraSlayerCommand::createProcesses(string outputFileName, string filename processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp", fasta + toString(getpid()) + ".temp"); + num = driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp", fasta + toString(getpid()) + ".temp", thisPriority); //pass numSeqs to parent ofstream out; string tempFile = outputFileName + toString(getpid()) + ".num.temp"; m->openOutputFile(tempFile, out); - out << num << '\t' << chimera->getNumNoParents() << endl; + out << num << endl; out.close(); exit(0); }else { @@ -1385,7 +2034,7 @@ int ChimeraSlayerCommand::createProcesses(string outputFileName, string filename ifstream in; string tempFile = outputFileName + toString(processIDS[i]) + ".num.temp"; m->openInputFile(tempFile, in); - if (!in.eof()) { int tempNum = 0; int tempNumParents = 0; in >> tempNum >> tempNumParents; num += tempNum; numNoParents += tempNumParents; } + if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } in.close(); m->mothurRemove(tempFile); } #else @@ -1402,11 +2051,11 @@ int ChimeraSlayerCommand::createProcesses(string outputFileName, string filename //Create processor worker threads. for( int i=0; istart, lines[i]->end, ksize, match, mismatch, window, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, divR, priority, i); + slayerData* tempslayer = new slayerData((outputFileName + extension), (fasta + extension), (accnos + extension), filename, templatefile, search, blastlocation, trimera, trim, realign, m, lines[i].start, lines[i].end, ksize, match, mismatch, window, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, divR, priority, i); pDataArray.push_back(tempslayer); processIDS.push_back(i); - //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. + //MySlayerThreadFunction is in header. It must be global or static to work with the threads. //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier hThreadArray[i] = CreateThread(NULL, 0, MySlayerThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); } @@ -1417,12 +2066,10 @@ int ChimeraSlayerCommand::createProcesses(string outputFileName, string filename //Close all thread handles and free memory allocations. for(int i=0; i < pDataArray.size(); i++){ num += pDataArray[i]->count; - numNoParents += pDataArray[i]->numNoParents; CloseHandle(hThreadArray[i]); delete pDataArray[i]; } #endif - if (num == numNoParents) { m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors."); m->mothurOutEndLine(); } rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str()); rename((accnos + toString(processIDS[0]) + ".temp").c_str(), accnos.c_str()); @@ -1509,9 +2156,22 @@ map ChimeraSlayerCommand::sortFastaFile(string fastaFile, string na in.close(); - //read namefile + //read namefile or countfile vector nameMapCount; - int error = m->readNames(nameFile, nameMapCount, seqs); + int error; + if (hasCount) { + CountTable ct; + ct.readTable(nameFile, true); + + for(map::iterator it = seqs.begin(); it != seqs.end(); it++) { + int num = ct.getNumSeqs(it->first); + if (num == 0) { error = 1; } + else { + seqPriorityNode temp(num, it->second, it->first); + nameMapCount.push_back(temp); + } + } + }else { error = m->readNames(nameFile, nameMapCount, seqs); } if (m->control_pressed) { return nameAbund; } @@ -1593,4 +2253,51 @@ map ChimeraSlayerCommand::sortFastaFile(vector& thisseqs, } } /**************************************************************************************************/ +int ChimeraSlayerCommand::sortFastaFile(vector& thisseqs, map& countMap, string newFile) { + try { + vector nameVector; + + //read through fastafile and store info + map seqs; + + for (int i = 0; i < thisseqs.size(); i++) { + + if (m->control_pressed) { return 0; } + + map::iterator itCountMap = countMap.find(thisseqs[i].getName()); + + if (itCountMap == countMap.end()){ + m->control_pressed = true; + m->mothurOut("[ERROR]: " + thisseqs[i].getName() + " is in your fastafile, but is not in your count file, please correct."); m->mothurOutEndLine(); + }else { + seqPriorityNode temp(itCountMap->second, thisseqs[i].getAligned(), thisseqs[i].getName()); + nameVector.push_back(temp); + } + } + + //sort by num represented + sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes); + + if (m->control_pressed) { return 0; } + + if (thisseqs.size() != nameVector.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your count file, aborting."); m->mothurOutEndLine(); m->control_pressed = true; return 0; } + + ofstream out; + m->openOutputFile(newFile, out); + + //print new file in order of + for (int i = 0; i < nameVector.size(); i++) { + out << ">" << nameVector[i].name << endl << nameVector[i].seq << endl; + } + out.close(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "sortFastaFile"); + exit(1); + } +} +/**************************************************************************************************/