From 2ea328ecaed9d038edca048887161e527523b671 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Tue, 14 May 2013 08:14:36 -0400 Subject: [PATCH] added count file to create.database command. corrected for change to get.oturep changing order of members in otu to reflect the names file format. That change caused the create.database command to fail to find otus --- createdatabasecommand.cpp | 141 ++++++++++++++++++++++++++++++++------ createdatabasecommand.h | 2 +- 2 files changed, 120 insertions(+), 23 deletions(-) diff --git a/createdatabasecommand.cpp b/createdatabasecommand.cpp index 5121139..58799e7 100644 --- a/createdatabasecommand.cpp +++ b/createdatabasecommand.cpp @@ -13,11 +13,12 @@ vector CreateDatabaseCommand::setParameters(){ try { CommandParameter pfasta("repfasta", "InputTypes", "", "", "none", "none", "none","database",false,true,true); parameters.push_back(pfasta); - CommandParameter pname("repname", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pname); + CommandParameter pname("repname", "InputTypes", "", "", "NameCount", "NameCount", "none","",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "NameCount", "none","",false,false,true); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup); CommandParameter pcontaxonomy("contaxonomy", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pcontaxonomy); CommandParameter plist("list", "InputTypes", "", "", "ListShared", "ListShared", "none","",false,false,true); parameters.push_back(plist); CommandParameter pshared("shared", "InputTypes", "", "", "ListShared", "ListShared", "none","",false,false,true); parameters.push_back(pshared); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pgroup); CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); @@ -35,17 +36,18 @@ vector CreateDatabaseCommand::setParameters(){ string CreateDatabaseCommand::getHelpString(){ try { string helpString = ""; - helpString += "The create.database command reads a list file or a shared file, *.cons.taxonomy, *.rep.fasta, *.rep.names and optional groupfile, and creates a database file.\n"; - helpString += "The create.database command parameters are repfasta, list, shared, repname, contaxonomy, group and label. List, repfasta, repnames, and contaxonomy are required.\n"; + helpString += "The create.database command reads a list file or a shared file, *.cons.taxonomy, *.rep.fasta, *.rep.names and optional groupfile, or count file and creates a database file.\n"; + helpString += "The create.database command parameters are repfasta, list, shared, repname, contaxonomy, group, count and label. List, repfasta, repnames or count, and contaxonomy are required.\n"; helpString += "The repfasta file is fasta file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n"; helpString += "The repname file is the name file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n"; - helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile).\n"; + helpString += "The count file is the count file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, count=yourCountFile). If it includes group info, mothur will give you the abundance breakdown by group. \n"; + helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile, name=yourNameFile).\n"; helpString += "The group file is optional and will just give you the abundance breakdown by group.\n"; helpString += "The label parameter allows you to specify a label to be used from your listfile.\n"; helpString += "NOTE: Make SURE the repfasta, repnames and contaxonomy are for the same label as the listfile.\n"; helpString += "The create.database command should be in the following format: \n"; helpString += "create.database(repfasta=yourFastaFileFromGetOTURep, repname=yourNameFileFromGetOTURep, contaxonomy=yourConTaxFileFromClassifyOTU, list=yourListFile) \n"; - helpString += "Example: create.database(repfasta=final.an.0.03.rep.fasta, name=final.an.0.03.rep.names, list=fina.an.list, label=0.03, contaxonomy=final.an.0.03.cons.taxonomy) \n"; + helpString += "Example: create.database(repfasta=final.an.0.03.rep.fasta, repname=final.an.0.03.rep.names, list=final.an.list, label=0.03, contaxonomy=final.an.0.03.cons.taxonomy) \n"; helpString += "Note: No spaces between parameter labels (i.e. repfasta), '=' and parameters (i.e.yourFastaFileFromGetOTURep).\n"; return helpString; } @@ -155,6 +157,14 @@ CreateDatabaseCommand::CreateDatabaseCommand(string option) { if (path == "") { parameters["group"] = inputDir + it->second; } } + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } + it = parameters.find("shared"); //user has given a template file if(it != parameters.end()){ @@ -211,10 +221,23 @@ CreateDatabaseCommand::CreateDatabaseCommand(string option) { else if (repfastafile == "not open") { repfastafile = ""; abort = true; } repnamesfile = validParameter.validFile(parameters, "repname", true); - if (repnamesfile == "not found") { //if there is a current list file, use it - repnamesfile = ""; m->mothurOut("The repnames parameter is required, aborting."); m->mothurOutEndLine(); abort = true; - } + if (repnamesfile == "not found") { repnamesfile = ""; } else if (repnamesfile == "not open") { repnamesfile = ""; abort = true; } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not found") { countfile = ""; } + else if (countfile == "not open") { countfile = ""; abort = true; } + + if ((countfile == "") && (repnamesfile == "")) { + //if there is a current name file, use it, else look for current count file + string repnamesfile = m->getNameFile(); + if (repnamesfile != "") { m->mothurOut("Using " + repnamesfile + " as input file for the repname parameter."); m->mothurOutEndLine(); } + else { + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("[ERROR]: You must provide a count or repname file."); m->mothurOutEndLine(); abort = true; } + } + } groupfile = validParameter.validFile(parameters, "group", true); if (groupfile == "not open") { groupfile = ""; abort = true; } @@ -253,7 +276,32 @@ int CreateDatabaseCommand::execute(){ //names redundants to uniques. backwards to how we normally do it, but each bin is the list file will be a key entry in the map. map repNames; - int numUniqueNamesFile = m->readNames(repnamesfile, repNames, 1); + map nameMap; + int numUniqueNamesFile = 0; + CountTable ct; + if (countfile == "") { + numUniqueNamesFile = m->readNames(repnamesfile, repNames, 1); + //the repnames file does not have the same order as the list file bins so we need to sort and reassemble for the search below + map tempRepNames; + for (map::iterator it = repNames.begin(); it != repNames.end();) { + string bin = it->first; + vector temp; + m->splitAtChar(bin, temp, ','); + sort(temp.begin(), temp.end()); + bin = ""; + for (int i = 0; i < temp.size()-1; i++) { + bin += temp[i] + ','; + } + bin += temp[temp.size()-1]; + tempRepNames[bin] = it->second; + repNames.erase(it++); + } + repNames = tempRepNames; + }else { + ct.readTable(countfile, true); + numUniqueNamesFile = ct.getNumUniqueSeqs(); + nameMap = ct.getNameMap(); + } //are there the same number of otus in the fasta and name files if (repOtusSizes.size() != numUniqueNamesFile) { m->mothurOut("[ERROR]: you have " + toString(numUniqueNamesFile) + " unique seqs in your repname file, but " + toString(repOtusSizes.size()) + " seqs in your repfasta file. These should match.\n"); m->control_pressed = true; } @@ -307,6 +355,11 @@ int CreateDatabaseCommand::execute(){ if (groupfile != "") { header = "OTUNumber\t"; for (int i = 0; i < groupmap->getNamesOfGroups().size(); i++) { header += (groupmap->getNamesOfGroups())[i] + '\t'; } + }else if (countfile != "") { + if (ct.hasGroupInfo()) { + header = "OTUNumber\t"; + for (int i = 0; i < ct.getNamesOfGroups().size(); i++) { header += (ct.getNamesOfGroups())[i] + '\t'; } + } } header += "repSeqName\trepSeq\tOTUConTaxonomy"; out << header << endl; @@ -319,17 +372,46 @@ int CreateDatabaseCommand::execute(){ vector binNames; string bin = list->get(i); - - map::iterator it = repNames.find(bin); - if (it == repNames.end()) { - m->mothurOut("[ERROR: OTU " + otuLabels[i] + " is not in the repnames file. Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; - } - m->splitAtComma(bin, binNames); - //sanity check - if (binNames.size() != classifyOtuSizes[i]) { - m->mothurOut("[ERROR: OTU " + otuLabels[i] + " contains " + toString(binNames.size()) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[i]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; + string seqRepName = ""; + int numSeqsRep = 0; + + if (countfile == "") { + sort(binNames.begin(), binNames.end()); + bin = ""; + for (int i = 0; i < binNames.size()-1; i++) { + bin += binNames[i] + ','; + } + bin += binNames[binNames.size()-1]; + map::iterator it = repNames.find(bin); + + if (it == repNames.end()) { + m->mothurOut("[ERROR: OTU " + otuLabels[i] + " is not in the repnames file. Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; + }else { seqRepName = it->second; numSeqsRep = binNames.size(); } + + //sanity check + if (binNames.size() != classifyOtuSizes[i]) { + m->mothurOut("[ERROR: OTU " + otuLabels[i] + " contains " + toString(binNames.size()) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[i]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; + } + }else { + //find rep sequence in bin + for (int j = 0; j < binNames.size(); j++) { + map::iterator itNameMap = nameMap.find(binNames[j]); //if you are in the counttable you must be the rep. because get.oturep with a countfile only includes the rep sequences in the rep.count file. + if (itNameMap != nameMap.end()) { + seqRepName = itNameMap->first; + numSeqsRep = itNameMap->second; + j += binNames.size(); + } + } + + if (seqRepName == "") { + m->mothurOut("[ERROR: OTU " + otuLabels[i] + " is not in the count file. Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; + } + + if (numSeqsRep != classifyOtuSizes[i]) { + m->mothurOut("[ERROR: OTU " + otuLabels[i] + " contains " + toString(numSeqsRep) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[i]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; + } } //output abundances @@ -353,10 +435,15 @@ int CreateDatabaseCommand::execute(){ for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { out << counts[(groupmap->getNamesOfGroups())[j]] << '\t'; } if (error) { m->control_pressed = true; } - }else { out << binNames.size() << '\t'; } + }else if (countfile != "") { + if (ct.hasGroupInfo()) { + vector groupCounts = ct.getGroupCounts(seqRepName); + for (int j = 0; j < groupCounts.size(); j++) { out << groupCounts[j] << '\t'; } + }else { out << numSeqsRep << '\t'; } + }else { out << numSeqsRep << '\t'; } //output repSeq - out << it->second << '\t' << seqs[i].getAligned() << '\t' << taxonomies[i] << endl; + out << seqRepName << '\t' << seqs[i].getAligned() << '\t' << taxonomies[i] << endl; } @@ -471,6 +558,7 @@ vector CreateDatabaseCommand::readFasta(vector& seqs){ ifstream in; m->openInputFile(repfastafile, in); + set sanity; while (!in.eof()) { if (m->control_pressed) { break; } @@ -481,11 +569,20 @@ vector CreateDatabaseCommand::readFasta(vector& seqs){ //the binInfo should look like - binNumber|size ie. 1|200 if it is binNumber|size|group then the user gave us the wrong repfasta file vector info; m->splitAtChar(binInfo, info, '|'); - if (info.size() != 2) { m->mothurOut("[ERROR]: your repfasta file is not the right format. The create database command is designed to be used with the output from get.oturep. When running get.oturep you can not use a group file, because mothur is only expecting one representative sequence per OTU and when you use a group file with get.oturep a representative is found for each group.\n"); m->control_pressed = true; break;} + //if (info.size() != 2) { m->mothurOut("[ERROR]: your repfasta file is not the right format. The create database command is designed to be used with the output from get.oturep. When running get.oturep you can not use a group file, because mothur is only expecting one representative sequence per OTU and when you use a group file with get.oturep a representative is found for each group.\n"); m->control_pressed = true; break;} int size = 0; m->mothurConvert(info[1], size); + int binNumber = 0; + string temp = ""; + for (int i = 0; i < info[0].size(); i++) { if (isspace(info[0][i])) {;}else{temp +=info[0][i]; } } + m->mothurConvert(temp, binNumber); + set::iterator it = sanity.find(binNumber); + if (it != sanity.end()) { + m->mothurOut("[ERROR]: your repfasta file is not the right format. The create database command is designed to be used with the output from get.oturep. When running get.oturep you can not use a group file, because mothur is only expecting one representative sequence per OTU and when you use a group file with get.oturep a representative is found for each group.\n"); m->control_pressed = true; break; + }else { sanity.insert(binNumber); } + sizes.push_back(size); seqs.push_back(seq); } diff --git a/createdatabasecommand.h b/createdatabasecommand.h index ca47b7a..79b5646 100644 --- a/createdatabasecommand.h +++ b/createdatabasecommand.h @@ -35,7 +35,7 @@ public: private: bool abort; - string sharedfile, listfile, groupfile, repfastafile, repnamesfile, contaxonomyfile, label, outputDir; + string sharedfile, listfile, groupfile, repfastafile, repnamesfile, contaxonomyfile, label, outputDir, countfile; vector outputNames; -- 2.39.2