X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=createdatabasecommand.cpp;h=51211395356228a5e3b9b3f554a399da3a558c30;hb=70491a12902e89b85cfa6b44a7b7fbe066ee2ac1;hp=57d5264b8b151d4175b6754ab2ee75732e1e597e;hpb=ee8403d4eb5760187d62b42a9cf4272de8fc0ec4;p=mothur.git diff --git a/createdatabasecommand.cpp b/createdatabasecommand.cpp index 57d5264..5121139 100644 --- a/createdatabasecommand.cpp +++ b/createdatabasecommand.cpp @@ -12,14 +12,15 @@ //********************************************************************************************************************** vector CreateDatabaseCommand::setParameters(){ try { - CommandParameter pfasta("repfasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("repname", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname); - CommandParameter pcontaxonomy("contaxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pcontaxonomy); - CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter pfasta("repfasta", "InputTypes", "", "", "none", "none", "none","database",false,true,true); parameters.push_back(pfasta); + CommandParameter pname("repname", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pname); + CommandParameter pcontaxonomy("contaxonomy", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pcontaxonomy); + CommandParameter plist("list", "InputTypes", "", "", "ListShared", "ListShared", "none","",false,false,true); parameters.push_back(plist); + CommandParameter pshared("shared", "InputTypes", "", "", "ListShared", "ListShared", "none","",false,false,true); parameters.push_back(pshared); + CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pgroup); + CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -34,8 +35,8 @@ vector CreateDatabaseCommand::setParameters(){ string CreateDatabaseCommand::getHelpString(){ try { string helpString = ""; - helpString += "The create.database command reads a listfile, *.cons.taxonomy, *.rep.fasta, *.rep.names and optional groupfile, and creates a database file.\n"; - helpString += "The create.database command parameters are repfasta, list, repname, contaxonomy, group and label. List, repfasta, repnames, and contaxonomy are required.\n"; + helpString += "The create.database command reads a list file or a shared file, *.cons.taxonomy, *.rep.fasta, *.rep.names and optional groupfile, and creates a database file.\n"; + helpString += "The create.database command parameters are repfasta, list, shared, repname, contaxonomy, group and label. List, repfasta, repnames, and contaxonomy are required.\n"; helpString += "The repfasta file is fasta file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n"; helpString += "The repname file is the name file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n"; helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile).\n"; @@ -54,6 +55,21 @@ string CreateDatabaseCommand::getHelpString(){ } } //********************************************************************************************************************** +string CreateDatabaseCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "database") { pattern = "[filename],database"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "CreateDatabaseCommand", "getOutputPattern"); + exit(1); + } +} +//********************************************************************************************************************** CreateDatabaseCommand::CreateDatabaseCommand(){ try { abort = true; calledHelp = true; @@ -138,6 +154,14 @@ CreateDatabaseCommand::CreateDatabaseCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("shared"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["shared"] = inputDir + it->second; } + } } @@ -146,14 +170,33 @@ CreateDatabaseCommand::CreateDatabaseCommand(string option) { //check for required parameters listfile = validParameter.validFile(parameters, "list", true); - if (listfile == "not found") { - //if there is a current list file, use it + if (listfile == "not found") { listfile = ""; } + else if (listfile == "not open") { listfile = ""; abort = true; } + else { m->setListFile(listfile); } + + sharedfile = validParameter.validFile(parameters, "shared", true); + if (sharedfile == "not found") { sharedfile = ""; } + else if (sharedfile == "not open") { sharedfile = ""; abort = true; } + else { m->setSharedFile(sharedfile); } + + if ((sharedfile == "") && (listfile == "")) { + //is there are current file available for either of these? + //give priority to list, then shared listfile = m->getListFile(); if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); } - else { m->mothurOut("You have no current listfile and the list parameter is required."); m->mothurOutEndLine(); abort = true; } + else { + sharedfile = m->getSharedFile(); + if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("No valid current files. You must provide a shared or list file before you can use the create.database command."); m->mothurOutEndLine(); + abort = true; + } + } } - else if (listfile == "not open") { abort = true; } - else { m->setListFile(listfile); } + else if ((sharedfile != "") && (listfile != "")) { m->mothurOut("When executing a create.database command you must enter ONLY ONE of the following: shared or list."); m->mothurOutEndLine(); abort = true; } + + if (sharedfile != "") { if (outputDir == "") { outputDir = m->hasPath(sharedfile); } } + else { if (outputDir == "") { outputDir = m->hasPath(listfile); } } contaxonomyfile = validParameter.validFile(parameters, "contaxonomy", true); if (contaxonomyfile == "not found") { //if there is a current list file, use it @@ -198,7 +241,8 @@ int CreateDatabaseCommand::execute(){ //taxonomies holds the taxonomy info for each Otu //classifyOtuSizes holds the size info of each Otu to help with error checking vector taxonomies; - vector classifyOtuSizes = readTax(taxonomies); + vector otuLabels; + vector classifyOtuSizes = readTax(taxonomies, otuLabels); if (m->control_pressed) { return 0; } @@ -209,7 +253,7 @@ int CreateDatabaseCommand::execute(){ //names redundants to uniques. backwards to how we normally do it, but each bin is the list file will be a key entry in the map. map repNames; - int numUniqueNamesFile = m->readNames(repnamesfile, repNames); + int numUniqueNamesFile = m->readNames(repnamesfile, repNames, 1); //are there the same number of otus in the fasta and name files if (repOtusSizes.size() != numUniqueNamesFile) { m->mothurOut("[ERROR]: you have " + toString(numUniqueNamesFile) + " unique seqs in your repname file, but " + toString(repOtusSizes.size()) + " seqs in your repfasta file. These should match.\n"); m->control_pressed = true; } @@ -230,86 +274,131 @@ int CreateDatabaseCommand::execute(){ if (m->control_pressed) { return 0; } - //at this point we are fairly sure the repfasta, repnames and contaxonomy files match so lets proceed with the listfile - ListVector* list = getList(); - - if (m->control_pressed) { delete list; return 0; } - - GroupMap* groupmap = NULL; - if (groupfile != "") { - groupmap = new GroupMap(groupfile); - groupmap->readMap(); - } - - if (m->control_pressed) { delete list; if (groupfile != "") { delete groupmap; } return 0; } - if (outputDir == "") { outputDir += m->hasPath(listfile); } - string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + "database"; + map variables; + if (listfile != "") { variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile)); } + else { variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile)); } + string outputFileName = getOutputFileName("database", variables); outputNames.push_back(outputFileName); outputTypes["database"].push_back(outputFileName); ofstream out; m->openOutputFile(outputFileName, out); string header = "OTUNumber\tAbundance\t"; - if (groupfile != "") { - header = "OTUNumber\t"; - for (int i = 0; i < groupmap->getNamesOfGroups().size(); i++) { header += (groupmap->getNamesOfGroups())[i] + '\t'; } - } - header += "repSeqName\trepSeq\tOTUConTaxonomy"; - out << header << endl; + - for (int i = 0; i < list->getNumBins(); i++) { + if (listfile != "") { + //at this point we are fairly sure the repfasta, repnames and contaxonomy files match so lets proceed with the listfile + ListVector* list = getList(); - if (m->control_pressed) { break; } - - out << (i+1) << '\t'; + if (otuLabels.size() != list->getNumBins()) { + m->mothurOut("[ERROR]: you have " + toString(otuLabels.size()) + " otus in your contaxonomy file, but your list file has " + toString(list->getNumBins()) + " otus. These should match. Make sure you are using files for the same distance.\n"); m->control_pressed = true; } - vector binNames; - string bin = list->get(i); + if (m->control_pressed) { delete list; return 0; } - map::iterator it = repNames.find(bin); - if (it == repNames.end()) { - m->mothurOut("[ERROR: OTU " + toString(i+1) + " is not in the repnames file. Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; + GroupMap* groupmap = NULL; + if (groupfile != "") { + groupmap = new GroupMap(groupfile); + groupmap->readMap(); } - m->splitAtComma(bin, binNames); + if (m->control_pressed) { delete list; if (groupfile != "") { delete groupmap; } return 0; } - //sanity check - if (binNames.size() != classifyOtuSizes[i]) { - m->mothurOut("[ERROR: OTU " + toString(i+1) + " contains " + toString(binNames.size()) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[i]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; + if (groupfile != "") { + header = "OTUNumber\t"; + for (int i = 0; i < groupmap->getNamesOfGroups().size(); i++) { header += (groupmap->getNamesOfGroups())[i] + '\t'; } } + header += "repSeqName\trepSeq\tOTUConTaxonomy"; + out << header << endl; - //output abundances - if (groupfile != "") { - string groupAbunds = ""; - map counts; - //initialize counts to 0 - for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { counts[(groupmap->getNamesOfGroups())[j]] = 0; } + for (int i = 0; i < list->getNumBins(); i++) { + + if (m->control_pressed) { break; } + + out << otuLabels[i] << '\t'; + + vector binNames; + string bin = list->get(i); + + map::iterator it = repNames.find(bin); + if (it == repNames.end()) { + m->mothurOut("[ERROR: OTU " + otuLabels[i] + " is not in the repnames file. Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; + } + + m->splitAtComma(bin, binNames); - //find abundances by group - bool error = false; - for (int j = 0; j < binNames.size(); j++) { - string group = groupmap->getGroup(binNames[j]); - if (group == "not found") { - m->mothurOut("[ERROR]: " + binNames[j] + " is not in your groupfile, please correct.\n"); - error = true; - }else { counts[group]++; } + //sanity check + if (binNames.size() != classifyOtuSizes[i]) { + m->mothurOut("[ERROR: OTU " + otuLabels[i] + " contains " + toString(binNames.size()) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[i]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; } - //output counts - for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { out << counts[(groupmap->getNamesOfGroups())[j]] << '\t'; } + //output abundances + if (groupfile != "") { + string groupAbunds = ""; + map counts; + //initialize counts to 0 + for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { counts[(groupmap->getNamesOfGroups())[j]] = 0; } + + //find abundances by group + bool error = false; + for (int j = 0; j < binNames.size(); j++) { + string group = groupmap->getGroup(binNames[j]); + if (group == "not found") { + m->mothurOut("[ERROR]: " + binNames[j] + " is not in your groupfile, please correct.\n"); + error = true; + }else { counts[group]++; } + } + + //output counts + for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { out << counts[(groupmap->getNamesOfGroups())[j]] << '\t'; } + + if (error) { m->control_pressed = true; } + }else { out << binNames.size() << '\t'; } - if (error) { m->control_pressed = true; } - }else { out << binNames.size() << '\t'; } + //output repSeq + out << it->second << '\t' << seqs[i].getAligned() << '\t' << taxonomies[i] << endl; + } + - //output repSeq - out << it->second << '\t' << seqs[i].getAligned() << '\t' << taxonomies[i] << endl; + delete list; + if (groupfile != "") { delete groupmap; } + + }else { + vector lookup = getShared(); + + header = "OTUNumber\t"; + for (int i = 0; i < lookup.size(); i++) { header += lookup[i]->getGroup() + '\t'; } + header += "repSeqName\trepSeq\tOTUConTaxonomy"; + out << header << endl; + + for (int h = 0; h < lookup[0]->getNumBins(); h++) { + + if (m->control_pressed) { break; } + + int index = findIndex(otuLabels, m->currentBinLabels[h]); + if (index == -1) { m->mothurOut("[ERROR]: " + m->currentBinLabels[h] + " is not in your constaxonomy file, aborting.\n"); m->control_pressed = true; } + + if (m->control_pressed) { break; } + + out << otuLabels[index] << '\t'; + + int totalAbund = 0; + for (int i = 0; i < lookup.size(); i++) { + int abund = lookup[i]->getAbundance(h); + totalAbund += abund; + out << abund << '\t'; + } + + //sanity check + if (totalAbund != classifyOtuSizes[index]) { + m->mothurOut("[WARNING]: OTU " + m->currentBinLabels[h] + " contains " + toString(totalAbund) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[index]) + ". Make sure you are using files for the same distance.\n"); //m->control_pressed = true; break; + } + + //output repSeq + out << seqs[index].getName() << '\t' << seqs[index].getAligned() << '\t' << taxonomies[index] << endl; + } } out.close(); - - delete list; - if (groupfile != "") { delete groupmap; } - if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; } m->mothurOutEndLine(); @@ -326,7 +415,21 @@ int CreateDatabaseCommand::execute(){ } } //********************************************************************************************************************** -vector CreateDatabaseCommand::readTax(vector& taxonomies){ +int CreateDatabaseCommand::findIndex(vector& otuLabels, string label){ + try { + int index = -1; + for (int i = 0; i < otuLabels.size(); i++) { + if (otuLabels[i] == label) { index = i; break; } + } + return index; + } + catch(exception& e) { + m->errorOut(e, "CreateDatabaseCommand", "findIndex"); + exit(1); + } +} +//********************************************************************************************************************** +vector CreateDatabaseCommand::readTax(vector& taxonomies, vector& otuLabels){ try { vector sizes; @@ -348,6 +451,7 @@ vector CreateDatabaseCommand::readTax(vector& taxonomies){ sizes.push_back(size); taxonomies.push_back(tax); + otuLabels.push_back(otu); } in.close(); @@ -472,5 +576,81 @@ ListVector* CreateDatabaseCommand::getList(){ } } //********************************************************************************************************************** +vector CreateDatabaseCommand::getShared(){ + try { + InputData input(sharedfile, "sharedfile"); + vector lookup = input.getSharedRAbundVectors(); + string lastLabel = lookup[0]->getLabel(); + + if (label == "") { label = lastLabel; return lookup; } + + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. + set labels; labels.insert(label); + set processedLabels; + set userLabels = labels; + + //as long as you are not at the end of the file or done wih the lines you want + while((lookup[0] != NULL) && (userLabels.size() != 0)) { + if (m->control_pressed) { return lookup; } + + if(labels.count(lookup[0]->getLabel()) == 1){ + processedLabels.insert(lookup[0]->getLabel()); + userLabels.erase(lookup[0]->getLabel()); + break; + } + + if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + string saveLabel = lookup[0]->getLabel(); + + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } + lookup = input.getSharedRAbundVectors(lastLabel); + + processedLabels.insert(lookup[0]->getLabel()); + userLabels.erase(lookup[0]->getLabel()); + + //restore real lastlabel to save below + lookup[0]->setLabel(saveLabel); + break; + } + + lastLabel = lookup[0]->getLabel(); + + //get next line to process + //prevent memory leak + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } + lookup = input.getSharedRAbundVectors(); + } + + + if (m->control_pressed) { return lookup; } + + //output error messages about any remaining user labels + set::iterator it; + bool needToRun = false; + for (it = userLabels.begin(); it != userLabels.end(); it++) { + m->mothurOut("Your file does not include the label " + *it); + if (processedLabels.count(lastLabel) != 1) { + m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine(); + needToRun = true; + }else { + m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine(); + } + } + + //run last label if you need to + if (needToRun == true) { + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } + lookup = input.getSharedRAbundVectors(lastLabel); + } + + return lookup; + } + catch(exception& e) { + m->errorOut(e, "CreateDatabaseCommand", "getList"); + exit(1); + } +} + +//**********************************************************************************************************************