X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=deconvolutecommand.cpp;h=98109ef33c488a071b04ad5ada84ffcab9d2e1f8;hp=96682af26b8bcb1b62c17987c82355856cfaf919;hb=cf9987b67aa49777a4c91c2d21f96e58bf17aa82;hpb=0470f6d037aacb3563c3f7010708120a4a67d4e6 diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp index 96682af..98109ef 100644 --- a/deconvolutecommand.cpp +++ b/deconvolutecommand.cpp @@ -8,108 +8,348 @@ */ #include "deconvolutecommand.h" +#include "sequence.hpp" +//********************************************************************************************************************** +vector DeconvoluteCommand::setParameters(){ + try { + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta-name",false,true,true); parameters.push_back(pfasta); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","name",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","count",false,false,true); parameters.push_back(pcount); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } + return myArray; + } + catch(exception& e) { + m->errorOut(e, "DeconvoluteCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string DeconvoluteCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The unique.seqs command reads a fastafile and creates a name or count file.\n"; + helpString += "It creates a file where the first column is the groupname and the second column is a list of sequence names who have the same sequence. \n"; + helpString += "If the sequence is unique the second column will just contain its name. \n"; + helpString += "The unique.seqs command parameters are fasta and name. fasta is required, unless there is a valid current fasta file.\n"; + helpString += "The unique.seqs command should be in the following format: \n"; + helpString += "unique.seqs(fasta=yourFastaFile) \n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "DeconvoluteCommand", "getHelpString"); + exit(1); + } +} +//********************************************************************************************************************** +string DeconvoluteCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "fasta") { pattern = "[filename],unique,[extension]"; } + else if (type == "name") { pattern = "[filename],names-[filename],[tag],names"; } + else if (type == "count") { pattern = "[filename],count_table-[filename],[tag],count_table"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "DeconvoluteCommand", "getOutputPattern"); + exit(1); + } +} + +//********************************************************************************************************************** +DeconvoluteCommand::DeconvoluteCommand(){ + try { + abort = true; calledHelp = true; + setParameters(); + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "DeconvoluteCommand", "DeconvoluteCommand"); + exit(1); + } +} /**************************************************************************************/ -DeconvoluteCommand::DeconvoluteCommand(string option) { +DeconvoluteCommand::DeconvoluteCommand(string option) { try { - globaldata = GlobalData::getInstance(); - abort = false; + abort = false; calledHelp = false; //allow user to run help - if(option == "help") { help(); abort = true; } + if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} else { - //valid paramters for this command - string Array[] = {"fasta"}; - vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + vector myArray = setParameters(); - parser = new OptionParser(); - parser->parse(option, parameters); delete parser; + OptionParser parser(option); + map parameters = parser.getParameters(); - ValidParameters* validParameter = new ValidParameters(); + ValidParameters validParameter; + map::iterator it; //check to make sure all parameters are valid for command for (it = parameters.begin(); it != parameters.end(); it++) { - if (validParameter->isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } } + //initialize outputTypes + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } + } + + //check for required parameters - filename = validParameter->validFile(parameters, "fasta", true); - if (filename == "not open") { abort = true; } - else if (filename == "not found") { filename = ""; cout << "fasta is a required parameter for the unique.seqs command." << endl; abort = true; } - else { globaldata->setFastaFile(filename); globaldata->setFormat("fasta"); } + inFastaName = validParameter.validFile(parameters, "fasta", true); + if (inFastaName == "not open") { abort = true; } + else if (inFastaName == "not found") { + inFastaName = m->getFastaFile(); + if (inFastaName != "") { m->mothurOut("Using " + inFastaName + " as input file for the fasta parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; } + }else { m->setFastaFile(inFastaName); } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + outputDir = ""; + outputDir += m->hasPath(inFastaName); //if user entered a file with a path then preserve it + } + + oldNameMapFName = validParameter.validFile(parameters, "name", true); + if (oldNameMapFName == "not open") { oldNameMapFName = ""; abort = true; } + else if (oldNameMapFName == "not found"){ oldNameMapFName = ""; } + else { m->setNameFile(oldNameMapFName); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (oldNameMapFName != "")) { m->mothurOut("When executing a unique.seqs command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } - delete validParameter; - } - } - catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the DeconvoluteCommand class Function DeconvoluteCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; - exit(1); - } - catch(...) { - cout << "An unknown error has occurred in the DeconvoluteCommand class function DeconvoluteCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; - exit(1); - } -} -//********************************************************************************************************************** + if (countfile == "") { + if (oldNameMapFName == "") { + vector files; files.push_back(inFastaName); + parser.getNameFile(files); + } + } + + } -void DeconvoluteCommand::help(){ - try { - cout << "The unique.seqs command reads a fastafile and creates a namesfile." << "\n"; - cout << "It creates a file where the first column is the groupname and the second column is a list of sequence names who have the same sequence. " << "\n"; - cout << "If the sequence is unique the second column will just contain its name. " << "\n"; - cout << "The unique.seqs command parameter is fasta and it is required." << "\n"; - cout << "The unique.seqs command should be in the following format: " << "\n"; - cout << "unique.seqs(fasta=yourFastaFile) " << "\n"; } catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the DeconvoluteCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + m->errorOut(e, "DeconvoluteCommand", "DeconvoluteCommand"); exit(1); } - catch(...) { - cout << "An unknown error has occurred in the DeconvoluteCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; - exit(1); - } } - /**************************************************************************************/ int DeconvoluteCommand::execute() { try { - if (abort == true) { return 0; } - + if (abort == true) { if (calledHelp) { return 0; } return 2; } + //prepare filenames and open files - outputFileName = (getRootName(filename) + "names"); - outFastafile = (getRootName(filename) + "unique.fasta"); + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(inFastaName)); + string outNameFile = getOutputFileName("name", variables); + string outCountFile = getOutputFileName("count", variables); + variables["[extension]"] = m->getExtension(inFastaName); + string outFastaFile = getOutputFileName("fasta", variables); + + map nameMap; + map::iterator itNames; + if (oldNameMapFName != "") { + m->readNames(oldNameMapFName, nameMap); + if (oldNameMapFName == outNameFile){ + //prepare filenames and open files + map mvariables; + mvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(inFastaName)); + mvariables["[tag]"] = "unique"; + outNameFile = getOutputFileName("name", mvariables); + } + } + CountTable ct; + if (countfile != "") { + ct.readTable(countfile, true, false); + if (countfile == outCountFile){ + //prepare filenames and open files + map mvariables; + mvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(inFastaName)); + mvariables["[tag]"] = "unique"; + outCountFile = getOutputFileName("count", mvariables); } + } - openInputFile(filename, in); - openOutputFile(outputFileName, out); - openOutputFile(outFastafile, outFasta); + if (m->control_pressed) { return 0; } + + ifstream in; + m->openInputFile(inFastaName, in); + + ofstream outFasta; + m->openOutputFile(outFastaFile, outFasta); + + map sequenceStrings; //sequenceString -> list of names. "atgc...." -> seq1,seq2,seq3. + map::iterator itStrings; + set nameInFastaFile; //for sanity checking + set::iterator itname; + vector nameFileOrder; + int count = 0; + while (!in.eof()) { + + if (m->control_pressed) { in.close(); outFasta.close(); m->mothurRemove(outFastaFile); return 0; } + + Sequence seq(in); + + if (seq.getName() != "") { + + //sanity checks + itname = nameInFastaFile.find(seq.getName()); + if (itname == nameInFastaFile.end()) { nameInFastaFile.insert(seq.getName()); } + else { m->mothurOut("[ERROR]: You already have a sequence named " + seq.getName() + " in your fasta file, sequence names must be unique, please correct."); m->mothurOutEndLine(); } - //constructor reads in file and store internally - fastamap = new FastaMap(); - - //two columns separated by tabs sequence name and then sequence - fastamap->readFastaFile(in); - - //print out new names file - //file contains 2 columns separated by tabs. the first column is the groupname(name of first sequence found. - //the second column is the list of names of identical sequences separated by ','. - fastamap->printNamesFile(out); - fastamap->printCondensedFasta(outFasta); - - out.close(); + itStrings = sequenceStrings.find(seq.getAligned()); + + if (itStrings == sequenceStrings.end()) { //this is a new unique sequence + //output to unique fasta file + seq.printSequence(outFasta); + + if (oldNameMapFName != "") { + itNames = nameMap.find(seq.getName()); + + if (itNames == nameMap.end()) { //namefile and fastafile do not match + m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine(); + }else { + sequenceStrings[seq.getAligned()] = itNames->second; + nameFileOrder.push_back(seq.getAligned()); + } + }else if (countfile != "") { + ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table + sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); + }else { sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); } + }else { //this is a dup + if (oldNameMapFName != "") { + itNames = nameMap.find(seq.getName()); + + if (itNames == nameMap.end()) { //namefile and fastafile do not match + m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine(); + }else { + sequenceStrings[seq.getAligned()] += "," + itNames->second; + } + }else if (countfile != "") { + int num = ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table + if (num != 0) { //its in the table + ct.mergeCounts(itStrings->second, seq.getName()); //merges counts and saves in uniques name + } + }else { sequenceStrings[seq.getAligned()] += "," + seq.getName(); } + } + + count++; + } + + m->gobble(in); + + if(count % 1000 == 0) { m->mothurOut(toString(count) + "\t" + toString(sequenceStrings.size())); m->mothurOutEndLine(); } + } + + if(count % 1000 != 0) { m->mothurOut(toString(count) + "\t" + toString(sequenceStrings.size())); m->mothurOutEndLine(); } + + in.close(); outFasta.close(); - + + if (m->control_pressed) { m->mothurRemove(outFastaFile); return 0; } + + //print new names file + ofstream outNames; + if (countfile == "") { m->openOutputFile(outNameFile, outNames); outputNames.push_back(outNameFile); outputTypes["name"].push_back(outNameFile); } + else { m->openOutputFile(outCountFile, outNames); ct.printHeaders(outNames); outputTypes["count"].push_back(outCountFile); outputNames.push_back(outCountFile); } + + for (int i = 0; i < nameFileOrder.size(); i++) { + if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); outNames.close(); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + itStrings = sequenceStrings.find(nameFileOrder[i]); + + if (itStrings != sequenceStrings.end()) { + if (countfile == "") { + //get rep name + int pos = (itStrings->second).find_first_of(','); + + if (pos == string::npos) { // only reps itself + outNames << itStrings->second << '\t' << itStrings->second << endl; + }else { + outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl; + } + }else { ct.printSeq(outNames, itStrings->second); } + }else{ m->mothurOut("[ERROR]: mismatch in namefile print."); m->mothurOutEndLine(); m->control_pressed = true; } + } + outNames.close(); + + if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + //set fasta file as new current fastafile + string current = ""; + itTypes = outputTypes.find("fasta"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } + } + + itTypes = outputTypes.find("name"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } + } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } + return 0; } catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the DeconvoluteCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; - exit(1); - } - catch(...) { - cout << "An unknown error has occurred in the DeconvoluteCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + m->errorOut(e, "DeconvoluteCommand", "execute"); exit(1); } }