X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=chimeraslayercommand.cpp;h=41661da085060fcd66913f31764232dbbacce342;hp=e2b93316a4ade033f6ce4db3988cfd09cbf432e9;hb=cf9987b67aa49777a4c91c2d21f96e58bf17aa82;hpb=ffc44592ff7ae94f14f9e21f87198e33d323cd1d diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index e2b9331..41661da 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -11,36 +11,40 @@ #include "deconvolutecommand.h" #include "referencedb.h" #include "sequenceparser.h" +#include "counttable.h" //********************************************************************************************************************** vector ChimeraSlayerCommand::setParameters(){ try { - CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate); - CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter pwindow("window", "Number", "", "50", "", "", "",false,false); parameters.push_back(pwindow); - CommandParameter pksize("ksize", "Number", "", "7", "", "", "",false,false); parameters.push_back(pksize); - CommandParameter pmatch("match", "Number", "", "5.0", "", "", "",false,false); parameters.push_back(pmatch); - CommandParameter pmismatch("mismatch", "Number", "", "-4.0", "", "", "",false,false); parameters.push_back(pmismatch); - CommandParameter pminsim("minsim", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminsim); - CommandParameter pmincov("mincov", "Number", "", "70", "", "", "",false,false); parameters.push_back(pmincov); - CommandParameter pminsnp("minsnp", "Number", "", "10", "", "", "",false,false); parameters.push_back(pminsnp); - CommandParameter pminbs("minbs", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminbs); - CommandParameter psearch("search", "Multiple", "kmer-blast", "blast", "", "", "",false,false); parameters.push_back(psearch); - CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter prealign("realign", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(prealign); - CommandParameter ptrim("trim", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptrim); - CommandParameter psplit("split", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psplit); - CommandParameter pnumwanted("numwanted", "Number", "", "15", "", "", "",false,false); parameters.push_back(pnumwanted); - CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); - CommandParameter pdivergence("divergence", "Number", "", "1.007", "", "", "",false,false); parameters.push_back(pdivergence); - CommandParameter pparents("parents", "Number", "", "3", "", "", "",false,false); parameters.push_back(pparents); - CommandParameter pincrement("increment", "Number", "", "5", "", "", "",false,false); parameters.push_back(pincrement); - CommandParameter pblastlocation("blastlocation", "String", "", "", "", "", "",false,false); parameters.push_back(pblastlocation); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); - CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave); + CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(ptemplate); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","chimera-accnos",false,true,true); parameters.push_back(pfasta); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup); + CommandParameter pwindow("window", "Number", "", "50", "", "", "","",false,false); parameters.push_back(pwindow); + CommandParameter pksize("ksize", "Number", "", "7", "", "", "","",false,false); parameters.push_back(pksize); + CommandParameter pmatch("match", "Number", "", "5.0", "", "", "","",false,false); parameters.push_back(pmatch); + CommandParameter pmismatch("mismatch", "Number", "", "-4.0", "", "", "","",false,false); parameters.push_back(pmismatch); + CommandParameter pminsim("minsim", "Number", "", "90", "", "", "","",false,false); parameters.push_back(pminsim); + CommandParameter pmincov("mincov", "Number", "", "70", "", "", "","",false,false); parameters.push_back(pmincov); + CommandParameter pminsnp("minsnp", "Number", "", "10", "", "", "","",false,false); parameters.push_back(pminsnp); + CommandParameter pminbs("minbs", "Number", "", "90", "", "", "","",false,false); parameters.push_back(pminbs); + CommandParameter psearch("search", "Multiple", "kmer-blast", "blast", "", "", "","",false,false); parameters.push_back(psearch); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + + CommandParameter prealign("realign", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(prealign); + CommandParameter ptrim("trim", "Boolean", "", "F", "", "", "","fasta",false,false); parameters.push_back(ptrim); + CommandParameter psplit("split", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psplit); + CommandParameter pnumwanted("numwanted", "Number", "", "15", "", "", "","",false,false); parameters.push_back(pnumwanted); + CommandParameter piters("iters", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(piters); + CommandParameter pdivergence("divergence", "Number", "", "1.007", "", "", "","",false,false); parameters.push_back(pdivergence); + CommandParameter pdups("dereplicate", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pdups); + CommandParameter pparents("parents", "Number", "", "3", "", "", "","",false,false); parameters.push_back(pparents); + CommandParameter pincrement("increment", "Number", "", "5", "", "", "","",false,false); parameters.push_back(pincrement); + CommandParameter pblastlocation("blastlocation", "String", "", "", "", "", "","",false,false); parameters.push_back(pblastlocation); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); + CommandParameter psave("save", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psave); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -57,16 +61,18 @@ string ChimeraSlayerCommand::getHelpString(){ string helpString = ""; helpString += "The chimera.slayer command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n"; helpString += "This command was modeled after the chimeraSlayer written by the Broad Institute.\n"; - helpString += "The chimera.slayer command parameters are fasta, name, template, processors, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, blastlocation and realign.\n"; + helpString += "The chimera.slayer command parameters are fasta, name, group, template, processors, dereplicate, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, blastlocation and realign.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n"; helpString += "The name parameter allows you to provide a name file, if you are using reference=self. \n"; helpString += "The group parameter allows you to provide a group file. The group file can be used with a namesfile and reference=self. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; + helpString += "The count parameter allows you to provide a count file. The count file reference=self. If your count file contains group information, when checking sequences, only sequences from the same group as the query sequence will be used as the reference. When you use a count file with group info and dereplicate=T, mothur will create a *.pick.count_table file containing seqeunces after chimeras are removed. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"; helpString += "The reference parameter allows you to enter a reference file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; #ifdef USE_MPI helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n"; #endif + helpString += "If the dereplicate parameter is false, then if one group finds the seqeunce to be chimeric, then all groups find it to be chimeric, default=f.\n"; helpString += "The trim parameter allows you to output a new fasta file containing your sequences with the chimeric ones trimmed to include only their longest piece, default=F. \n"; helpString += "The split parameter allows you to check both pieces of non-chimeric sequence for chimeras, thus looking for trimeras and quadmeras. default=F. \n"; helpString += "The window parameter allows you to specify the window size for searching for chimeras, default=50. \n"; @@ -98,6 +104,24 @@ string ChimeraSlayerCommand::getHelpString(){ } } //********************************************************************************************************************** +string ChimeraSlayerCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "chimera") { pattern = "[filename],slayer.chimeras"; } + else if (type == "accnos") { pattern = "[filename],slayer.accnos"; } + else if (type == "fasta") { pattern = "[filename],slayer.fasta"; } + else if (type == "count") { pattern = "[filename],slayer.pick.count_table"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "getOutputPattern"); + exit(1); + } +} +//********************************************************************************************************************** ChimeraSlayerCommand::ChimeraSlayerCommand(){ try { abort = true; calledHelp = true; @@ -106,6 +130,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(){ outputTypes["chimera"] = tempOutNames; outputTypes["accnos"] = tempOutNames; outputTypes["fasta"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "ChimeraSlayerCommand", "ChimeraSlayerCommand"); @@ -117,6 +142,8 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { try { abort = false; calledHelp = false; ReferenceDB* rdb = ReferenceDB::getInstance(); + hasCount = false; + hasName = false; //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -140,6 +167,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { outputTypes["chimera"] = tempOutNames; outputTypes["accnos"] = tempOutNames; outputTypes["fasta"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -225,9 +253,8 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { //check for required parameters - bool hasName = true; namefile = validParameter.validFile(parameters, "name", false); - if (namefile == "not found") { namefile = ""; hasName = false; } + if (namefile == "not found") { namefile = ""; } else { m->splitAtDash(namefile, nameFileNames); @@ -294,12 +321,91 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { } } } + } + + if (nameFileNames.size() != 0) { hasName = true; } + + //check for required parameters + vector countfileNames; + countfile = validParameter.validFile(parameters, "count", false); + if (countfile == "not found") { + countfile = ""; + }else { + m->splitAtDash(countfile, countfileNames); - //make sure there is at least one valid file left - if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; } + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < countfileNames.size(); i++) { + + bool ignore = false; + if (countfileNames[i] == "current") { + countfileNames[i] = m->getCountTableFile(); + if (nameFileNames[i] != "") { m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(countfileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { countfileNames[i] = inputDir + countfileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = m->openInputFile(countfileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + countfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + }else { + m->setCountTableFile(countfileNames[i]); + } + } + } } - - if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } + + if (countfileNames.size() != 0) { hasCount = true; } + + //make sure there is at least one valid file left + if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + + if (!hasName && hasCount) { nameFileNames = countfileNames; } + + if ((hasCount || hasName) && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of name or count files does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } bool hasGroup = true; groupfile = validParameter.validFile(parameters, "group", false); @@ -377,7 +483,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } - + if (hasGroup && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; } //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -427,6 +533,12 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { m->mothurOutEndLine(); save = false; } + }else if (hasCount) { templatefile = "self"; + if (save) { + m->mothurOut("[WARNING]: You can't save reference=self, ignoring save."); + m->mothurOutEndLine(); + save = false; + } } else { if (rdb->referenceSeqs.size() != 0) { @@ -489,6 +601,10 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found") { temp = "15"; } m->mothurConvert(temp, numwanted); + + temp = validParameter.validFile(parameters, "dereplicate", false); + if (temp == "not found") { temp = "false"; } + dups = m->isTrue(temp); blastlocation = validParameter.validFile(parameters, "blastlocation", false); if (blastlocation == "not found") { blastlocation = ""; } @@ -529,7 +645,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { if ((search != "blast") && (search != "kmer")) { m->mothurOut(search + " is not a valid search."); m->mothurOutEndLine(); abort = true; } - if (hasName && (templatefile != "self")) { m->mothurOut("You have provided a namefile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } + if ((hasName || hasCount) && (templatefile != "self")) { m->mothurOut("You have provided a namefile or countfile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } if (hasGroup && (templatefile != "self")) { m->mothurOut("You have provided a group file and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } //until we resolve the issue 10-18-11 @@ -555,10 +671,13 @@ int ChimeraSlayerCommand::execute(){ m->mothurOut("Checking sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine(); int start = time(NULL); - if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]); }//if user entered a file with a path then preserve it - string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "slayer.chimera"; - string accnosFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "slayer.accnos"; - string trimFastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "slayer.fasta"; + if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]); }//if user entered a file with a path then preserve it + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])); + string outputFileName = getOutputFileName("chimera", variables); + string accnosFileName = getOutputFileName("accnos", variables); + string trimFastaFileName = getOutputFileName("fasta", variables); + string newCountFile = ""; //clears files ofstream out, out1, out2; @@ -577,13 +696,23 @@ int ChimeraSlayerCommand::execute(){ map fileGroup; fileToPriority[fastaFileNames[s]] = priority; //default fileGroup[fastaFileNames[s]] = "noGroup"; - SequenceParser* parser = NULL; + map uniqueNames; int totalChimeras = 0; lines.clear(); - if (templatefile == "self") { setUpForSelfReference(parser, fileGroup, fileToPriority, s); } + if (templatefile == "self") { + if (hasCount) { + SequenceCountParser* parser = NULL; + setUpForSelfReference(parser, fileGroup, fileToPriority, s); + if (parser != NULL) { uniqueNames = parser->getAllSeqsMap(); delete parser; } + }else { + SequenceParser* parser = NULL; + setUpForSelfReference(parser, fileGroup, fileToPriority, s); + if (parser != NULL) { uniqueNames = parser->getAllSeqsMap(); delete parser; } + } + } - if (m->control_pressed) { if (parser != NULL) { delete parser; } for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } if (fileToPriority.size() == 1) { //you running without a groupfile itFile = fileToPriority.begin(); @@ -615,14 +744,39 @@ int ChimeraSlayerCommand::execute(){ if(processors == 1){ numSeqs = driver(lines[0], outputFileName, thisFastaName, accnosFileName, trimFastaFileName, thisPriority); } else{ numSeqs = createProcesses(outputFileName, thisFastaName, accnosFileName, trimFastaFileName, thisPriority); } - if (m->control_pressed) { if (parser != NULL) { delete parser; } outputTypes.clear(); if (trim) { m->mothurRemove(trimFastaFileName); } m->mothurRemove(outputFileName); m->mothurRemove(accnosFileName); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + if (m->control_pressed) { outputTypes.clear(); if (trim) { m->mothurRemove(trimFastaFileName); } m->mothurRemove(outputFileName); m->mothurRemove(accnosFileName); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } #endif }else { //you have provided a groupfile + string countFile = ""; + if (hasCount) { + countFile = nameFileNames[s]; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFileNames[s])); + newCountFile = getOutputFileName("count", variables); + } #ifdef USE_MPI - MPIExecuteGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup); + MPIExecuteGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup, newCountFile, countFile); #else - if (processors == 1) { numSeqs = driverGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup); } - else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup); } //destroys fileToPriority + if (processors == 1) { + numSeqs = driverGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup, newCountFile); + if (hasCount && dups) { + CountTable c; c.readTable(nameFileNames[s], true, false); + if (!m->isBlank(newCountFile)) { + ifstream in2; + m->openInputFile(newCountFile, in2); + + string name, group; + while (!in2.eof()) { + in2 >> name >> group; m->gobble(in2); + c.setAbund(name, group, 0); + } + in2.close(); + } + m->mothurRemove(newCountFile); + c.printTable(newCountFile); + } + + } + else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, trimFastaFileName, fileToPriority, fileGroup, newCountFile, countFile); } //destroys fileToPriority #endif #ifdef USE_MPI @@ -631,16 +785,37 @@ int ChimeraSlayerCommand::execute(){ if (pid == 0) { #endif - - totalChimeras = deconvoluteResults(parser, outputFileName, accnosFileName, trimFastaFileName); - m->mothurOutEndLine(); m->mothurOut(toString(totalChimeras) + " chimera found."); m->mothurOutEndLine(); + if (!dups) { + totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, trimFastaFileName); + m->mothurOutEndLine(); m->mothurOut(toString(totalChimeras) + " chimera found."); m->mothurOutEndLine(); + }else { + if (hasCount) { + set doNotRemove; + CountTable c; c.readTable(newCountFile, true, true); + vector namesInTable = c.getNamesOfSeqs(); + for (int i = 0; i < namesInTable.size(); i++) { + int temp = c.getNumSeqs(namesInTable[i]); + if (temp == 0) { c.remove(namesInTable[i]); } + else { doNotRemove.insert((namesInTable[i])); } + } + //remove names we want to keep from accnos file. + set accnosNames = m->readAccnos(accnosFileName); + ofstream out2; + m->openOutputFile(accnosFileName, out2); + for (set::iterator it = accnosNames.begin(); it != accnosNames.end(); it++) { + if (doNotRemove.count(*it) == 0) { out2 << (*it) << endl; } + } + out2.close(); + c.printTable(newCountFile); + outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile); + } + } + #ifdef USE_MPI } MPI_Barrier(MPI_COMM_WORLD); //make everyone wait #endif } - - if (parser != NULL) { delete parser; } m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); } @@ -659,6 +834,11 @@ int ChimeraSlayerCommand::execute(){ } } + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } @@ -673,7 +853,7 @@ int ChimeraSlayerCommand::execute(){ } } //********************************************************************************************************************** -int ChimeraSlayerCommand::MPIExecuteGroups(string outputFileName, string accnosFileName, string trimFastaFileName, map >& fileToPriority, map& fileGroup){ +int ChimeraSlayerCommand::MPIExecuteGroups(string outputFileName, string accnosFileName, string trimFastaFileName, map >& fileToPriority, map& fileGroup, string countlist, string countfile){ try { #ifdef USE_MPI int pid; @@ -699,6 +879,7 @@ int ChimeraSlayerCommand::MPIExecuteGroups(string outputFileName, string accnosF MPI_File outMPI; MPI_File outMPIAccnos; MPI_File outMPIFasta; + MPI_File outMPICount; int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; int inMode=MPI_MODE_RDONLY; @@ -711,12 +892,16 @@ int ChimeraSlayerCommand::MPIExecuteGroups(string outputFileName, string accnosF char outFastaFilename[1024]; strcpy(outFastaFilename, trimFastaFileName.c_str()); + + char outCountFilename[1024]; + strcpy(outCountFilename, countlist.c_str()); MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI); MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos); if (trim) { MPI_File_open(MPI_COMM_WORLD, outFastaFilename, outMode, MPI_INFO_NULL, &outMPIFasta); } + if (hasCount && dups) { MPI_File_open(MPI_COMM_WORLD, outCountFilename, outMode, MPI_INFO_NULL, &outMPICount); } - if (m->control_pressed) { MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); return 0; } + if (m->control_pressed) { MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); if (hasCount && dups) { MPI_File_close(&outMPICount); } return 0; } //print headers if (pid == 0) { //you are the root process @@ -747,16 +932,55 @@ int ChimeraSlayerCommand::MPIExecuteGroups(string outputFileName, string accnosF strcpy(inFileName, thisFastaName.c_str()); MPI_File inMPI; MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer - + MPIPos = m->setFilePosFasta(thisFastaName, num); //fills MPIPos, returns numSeqs cout << endl << "Checking sequences from group: " << fileGroup[thisFastaName] << "." << endl; - driverMPI(0, num, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos, thisFastaName, thisPriority, true); + set cnames; + driverMPI(0, num, inMPI, outMPI, outMPIAccnos, outMPIFasta, cnames, MPIPos, thisFastaName, thisPriority, true); numSeqs += num; MPI_File_close(&inMPI); m->mothurRemove(thisFastaName); + + if (dups) { + if (cnames.size() != 0) { + if (hasCount) { + for (set::iterator it = cnames.begin(); it != cnames.end(); it++) { + string outputString = (*it) + "\t" + fileGroup[thisFastaName] + "\n"; + int length = outputString.length(); + char* buf2 = new char[length]; + memcpy(buf2, outputString.c_str(), length); + MPI_File_write_shared(outMPICount, buf2, length, MPI_CHAR, &status); + delete buf2; + } + }else { + map >::iterator itGroupNameMap = group2NameMap.find(fileGroup[thisFastaName]); + if (itGroupNameMap != group2NameMap.end()) { + map thisnamemap = itGroupNameMap->second; + map::iterator itN; + for (set::iterator it = cnames.begin(); it != cnames.end(); it++) { + itN = thisnamemap.find(*it); + if (itN != thisnamemap.end()) { + vector tempNames; m->splitAtComma(itN->second, tempNames); + for (int j = 0; j < tempNames.size(); j++) { //write to accnos file + string outputString = tempNames[j] + "\n"; + int length = outputString.length(); + char* buf2 = new char[length]; + memcpy(buf2, outputString.c_str(), length); + + MPI_File_write_shared(outMPIAccnos, buf2, length, MPI_CHAR, &status); + delete buf2; + } + + }else { m->mothurOut("[ERROR]: parsing cannot find " + *it + ".\n"); m->control_pressed = true; } + } + }else { m->mothurOut("[ERROR]: parsing cannot find " + fileGroup[thisFastaName] + ".\n"); m->control_pressed = true; } + } + + } + } cout << endl << "It took " << toString(time(NULL) - start) << " secs to check " + toString(num) + " sequences from group " << fileGroup[thisFastaName] << "." << endl; } @@ -772,6 +996,7 @@ int ChimeraSlayerCommand::MPIExecuteGroups(string outputFileName, string accnosF MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); if (trim) { MPI_File_close(&outMPIFasta); } + if (hasCount && dups) { MPI_File_close(&outMPICount); } MPI_Barrier(MPI_COMM_WORLD); //make everyone wait #endif @@ -857,7 +1082,8 @@ int ChimeraSlayerCommand::MPIExecute(string inputFile, string outputFileName, st } //do your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos, inputFile, priority, false); + set cnames; + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, cnames, MPIPos, inputFile, priority, false); if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); return 0; } @@ -873,7 +1099,8 @@ int ChimeraSlayerCommand::MPIExecute(string inputFile, string outputFileName, st if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; } //do your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos, inputFile, priority, false); + set cnames; + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, cnames, MPIPos, inputFile, priority, false); if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); return 0; } @@ -897,9 +1124,8 @@ int ChimeraSlayerCommand::MPIExecute(string inputFile, string outputFileName, st } } //********************************************************************************************************************** -int ChimeraSlayerCommand::deconvoluteResults(SequenceParser* parser, string outputFileName, string accnosFileName, string trimFileName){ +int ChimeraSlayerCommand::deconvoluteResults(map& uniqueNames, string outputFileName, string accnosFileName, string trimFileName){ try { - map uniqueNames = parser->getAllSeqsMap(); map::iterator itUnique; int total = 0; @@ -1132,6 +1358,7 @@ int ChimeraSlayerCommand::setUpForSelfReference(SequenceParser*& parser, map thisGroupsSeqs = parser->getSeqs(groups[i]); map thisGroupsMap = parser->getNameMap(groups[i]); + group2NameMap[groups[i]] = thisGroupsMap; string newFastaFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + groups[i] + "-sortedTemp.fasta"; priority = sortFastaFile(thisGroupsSeqs, thisGroupsMap, newFastaFile); fileToPriority[newFastaFile] = priority; @@ -1147,7 +1374,51 @@ int ChimeraSlayerCommand::setUpForSelfReference(SequenceParser*& parser, map& fileGroup, map >& fileToPriority, int s){ + try { + fileGroup.clear(); + fileToPriority.clear(); + + string nameFile = ""; + if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one + nameFile = nameFileNames[s]; + }else { m->control_pressed = true; return 0; } + + CountTable ct; + if (!ct.testGroups(nameFile)) { + if (processors != 1) { m->mothurOut("When using template=self, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } + + //sort fastafile by abundance, returns new sorted fastafile name + m->mothurOut("Sorting fastafile according to abundance..."); cout.flush(); + priority = sortFastaFile(fastaFileNames[s], nameFile); + m->mothurOut("Done."); m->mothurOutEndLine(); + + fileToPriority[fastaFileNames[s]] = priority; + fileGroup[fastaFileNames[s]] = "noGroup"; + }else { + //Parse sequences by group + parser = new SequenceCountParser(nameFile, fastaFileNames[s]); + vector groups = parser->getNamesOfGroups(); + + for (int i = 0; i < groups.size(); i++) { + vector thisGroupsSeqs = parser->getSeqs(groups[i]); + map thisGroupsMap = parser->getCountTable(groups[i]); + string newFastaFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + groups[i] + "-sortedTemp.fasta"; + sortFastaFile(thisGroupsSeqs, thisGroupsMap, newFastaFile); + fileToPriority[newFastaFile] = thisGroupsMap; + fileGroup[newFastaFile] = groups[i]; + } + } + + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "setUpForSelfReference"); + exit(1); + } +} //********************************************************************************************************************** string ChimeraSlayerCommand::getNamesFile(string& inputFile){ try { @@ -1182,9 +1453,12 @@ string ChimeraSlayerCommand::getNamesFile(string& inputFile){ } //********************************************************************************************************************** -int ChimeraSlayerCommand::driverGroups(string outputFName, string accnos, string fasta, map >& fileToPriority, map& fileGroup){ +int ChimeraSlayerCommand::driverGroups(string outputFName, string accnos, string fasta, map >& fileToPriority, map& fileGroup, string countlist){ try { int totalSeqs = 0; + ofstream outCountList; + + if (hasCount && dups) { m->openOutputFile(countlist, outCountList); } for (map >::iterator itFile = fileToPriority.begin(); itFile != fileToPriority.end(); itFile++) { @@ -1209,6 +1483,44 @@ int ChimeraSlayerCommand::driverGroups(string outputFName, string accnos, string #endif int numSeqs = driver(lines[0], thisoutputFileName, thisFastaName, thisaccnosFileName, thistrimFastaFileName, thisPriority); + //if we provided a count file with group info and set dereplicate=t, then we want to create a *.pick.count_table + //This table will zero out group counts for seqs determined to be chimeric by that group. + if (dups) { + if (!m->isBlank(thisaccnosFileName)) { + ifstream in; + m->openInputFile(thisaccnosFileName, in); + string name; + if (hasCount) { + while (!in.eof()) { + in >> name; m->gobble(in); + outCountList << name << '\t' << fileGroup[thisFastaName] << endl; + } + in.close(); + }else { + map >::iterator itGroupNameMap = group2NameMap.find(fileGroup[thisFastaName]); + if (itGroupNameMap != group2NameMap.end()) { + map thisnamemap = itGroupNameMap->second; + map::iterator itN; + ofstream out; + m->openOutputFile(thisaccnosFileName+".temp", out); + while (!in.eof()) { + in >> name; m->gobble(in); + itN = thisnamemap.find(name); + if (itN != thisnamemap.end()) { + vector tempNames; m->splitAtComma(itN->second, tempNames); + for (int j = 0; j < tempNames.size(); j++) { out << tempNames[j] << endl; } + + }else { m->mothurOut("[ERROR]: parsing cannot find " + name + ".\n"); m->control_pressed = true; } + } + out.close(); + in.close(); + m->renameFile(thisaccnosFileName+".temp", thisaccnosFileName); + }else { m->mothurOut("[ERROR]: parsing cannot find " + fileGroup[thisFastaName] + ".\n"); m->control_pressed = true; } + } + + } + } + //append files m->appendFiles(thisoutputFileName, outputFName); m->mothurRemove(thisoutputFileName); m->appendFiles(thisaccnosFileName, accnos); m->mothurRemove(thisaccnosFileName); @@ -1220,6 +1532,8 @@ int ChimeraSlayerCommand::driverGroups(string outputFName, string accnos, string m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + fileGroup[thisFastaName] + "."); m->mothurOutEndLine(); } + if (hasCount && dups) { outCountList.close(); } + return totalSeqs; } catch(exception& e) { @@ -1228,13 +1542,16 @@ int ChimeraSlayerCommand::driverGroups(string outputFName, string accnos, string } } /**************************************************************************************************/ -int ChimeraSlayerCommand::createProcessesGroups(string outputFName, string accnos, string fasta, map >& fileToPriority, map& fileGroup) { +int ChimeraSlayerCommand::createProcessesGroups(string outputFName, string accnos, string fasta, map >& fileToPriority, map& fileGroup, string countlist, string countFile) { try { int process = 1; int num = 0; processIDS.clear(); if (fileToPriority.size() < processors) { processors = fileToPriority.size(); } + + CountTable newCount; + if (hasCount && dups) { newCount.readTable(countFile, true, false); } int groupsPerProcessor = fileToPriority.size() / processors; int remainder = fileToPriority.size() % processors; @@ -1266,7 +1583,7 @@ int ChimeraSlayerCommand::createProcessesGroups(string outputFName, string accno processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driverGroups(outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", fasta + toString(getpid()) + ".temp", breakUp[process], fileGroup); + num = driverGroups(outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", fasta + toString(getpid()) + ".temp", breakUp[process], fileGroup, accnos + toString(getpid()) + ".byCount"); //pass numSeqs to parent ofstream out; @@ -1282,7 +1599,7 @@ int ChimeraSlayerCommand::createProcessesGroups(string outputFName, string accno } } - num = driverGroups(outputFName, accnos, fasta, breakUp[0], fileGroup); + num = driverGroups(outputFName, accnos, fasta, breakUp[0], fileGroup, accnos + ".byCount"); //force parent to wait until all the processes are done for (int i=0;ifileToPriority.size() != pDataArray[i]->end) { + m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->end) + " of " + toString(pDataArray[i]->fileToPriority.size()) + " groups assigned to it, quitting. \n"); m->control_pressed = true; + } num += pDataArray[i]->count; CloseHandle(hThreadArray[i]); delete pDataArray[i]; } #endif + //read my own + if (hasCount && dups) { + if (!m->isBlank(accnos + ".byCount")) { + ifstream in2; + m->openInputFile(accnos + ".byCount", in2); + + string name, group; + while (!in2.eof()) { + in2 >> name >> group; m->gobble(in2); + newCount.setAbund(name, group, 0); + } + in2.close(); + } + m->mothurRemove(accnos + ".byCount"); + } + //append output files for(int i=0;iappendFiles((fasta + toString(processIDS[i]) + ".temp"), fasta); m->mothurRemove((fasta + toString(processIDS[i]) + ".temp")); } + + if (hasCount && dups) { + if (!m->isBlank(accnos + toString(processIDS[i]) + ".byCount")) { + ifstream in2; + m->openInputFile(accnos + toString(processIDS[i]) + ".byCount", in2); + + string name, group; + while (!in2.eof()) { + in2 >> name >> group; m->gobble(in2); + newCount.setAbund(name, group, 0); + } + in2.close(); + } + m->mothurRemove(accnos + toString(processIDS[i]) + ".byCount"); + } + } + //print new *.pick.count_table + if (hasCount && dups) { newCount.printTable(countlist); } return num; } @@ -1469,10 +1823,10 @@ int ChimeraSlayerCommand::driver(linePair filePos, string outputFName, string fi delete candidateSeq; //report progress - if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + if((count) % 100 == 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count) + "\n"); } } //report progress - if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + if((count) % 100 != 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+ "\n"); } int numNoParents = chimera->getNumNoParents(); if (numNoParents == count) { m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors."); m->mothurOutEndLine(); } @@ -1494,7 +1848,7 @@ int ChimeraSlayerCommand::driver(linePair filePos, string outputFName, string fi } //********************************************************************************************************************** #ifdef USE_MPI -int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, MPI_File& outFastaMPI, vector& MPIPos, string filename, map& priority, bool byGroup){ +int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, MPI_File& outFastaMPI, set& cnames, vector& MPIPos, string filename, map& priority, bool byGroup){ try { MPI_Status status; int pid; @@ -1578,7 +1932,10 @@ int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_Fil data_results rightResults = chimera->getResults(); //if either piece is chimeric then report - Sequence trimmed = chimera->print(outMPI, outAccMPI, leftResults, rightResults); + bool flag = false; + Sequence trimmed = chimera->print(outMPI, outAccMPI, leftResults, rightResults, flag); + if (flag) { cnames.insert(candidateSeq->getName()); } + if (trim) { string outputString = ">" + trimmed.getName() + "\n" + trimmed.getAligned() + "\n"; @@ -1596,6 +1953,7 @@ int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_Fil }else { //print results Sequence trimmed = chimera->print(outMPI, outAccMPI); + cnames.insert(candidateSeq->getName()); if (trim) { string outputString = ">" + trimmed.getName() + "\n" + trimmed.getAligned() + "\n"; @@ -1615,10 +1973,10 @@ int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_Fil delete candidateSeq; //report progress - if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n"); } + if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; } } //report progress - if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n"); } + if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; } int numNoParents = chimera->getNumNoParents(); if (numNoParents == num) { cout << "[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors." << endl; } @@ -1798,9 +2156,22 @@ map ChimeraSlayerCommand::sortFastaFile(string fastaFile, string na in.close(); - //read namefile + //read namefile or countfile vector nameMapCount; - int error = m->readNames(nameFile, nameMapCount, seqs); + int error; + if (hasCount) { + CountTable ct; + ct.readTable(nameFile, true, false); + + for(map::iterator it = seqs.begin(); it != seqs.end(); it++) { + int num = ct.getNumSeqs(it->first); + if (num == 0) { error = 1; } + else { + seqPriorityNode temp(num, it->second, it->first); + nameMapCount.push_back(temp); + } + } + }else { error = m->readNames(nameFile, nameMapCount, seqs); } if (m->control_pressed) { return nameAbund; } @@ -1882,4 +2253,51 @@ map ChimeraSlayerCommand::sortFastaFile(vector& thisseqs, } } /**************************************************************************************************/ +int ChimeraSlayerCommand::sortFastaFile(vector& thisseqs, map& countMap, string newFile) { + try { + vector nameVector; + + //read through fastafile and store info + map seqs; + + for (int i = 0; i < thisseqs.size(); i++) { + + if (m->control_pressed) { return 0; } + + map::iterator itCountMap = countMap.find(thisseqs[i].getName()); + + if (itCountMap == countMap.end()){ + m->control_pressed = true; + m->mothurOut("[ERROR]: " + thisseqs[i].getName() + " is in your fastafile, but is not in your count file, please correct."); m->mothurOutEndLine(); + }else { + seqPriorityNode temp(itCountMap->second, thisseqs[i].getAligned(), thisseqs[i].getName()); + nameVector.push_back(temp); + } + } + + //sort by num represented + sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes); + + if (m->control_pressed) { return 0; } + + if (thisseqs.size() != nameVector.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your count file, aborting."); m->mothurOutEndLine(); m->control_pressed = true; return 0; } + + ofstream out; + m->openOutputFile(newFile, out); + + //print new file in order of + for (int i = 0; i < nameVector.size(); i++) { + out << ">" << nameVector[i].name << endl << nameVector[i].seq << endl; + } + out.close(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "sortFastaFile"); + exit(1); + } +} +/**************************************************************************************************/