X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=getoturepcommand.cpp;h=9dfc8bdeadc83c9784a42c10d66b26efcd52e8d4;hb=a8367302932de9be5434e77f6e5829d7609e2aec;hp=e9373a93cbf81ae6bcd84bdd063e01947994de6e;hpb=aa9238c0a9e6e7aa0ed8b8b606b08ad4fd7dcfe3;p=mothur.git diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp index e9373a9..9dfc8bd 100644 --- a/getoturepcommand.cpp +++ b/getoturepcommand.cpp @@ -12,7 +12,7 @@ #include "readcolumn.h" #include "formatphylip.h" #include "formatcolumn.h" - +#include "sharedutilities.h" //******************************************************************************************************************** @@ -48,7 +48,7 @@ GetOTURepCommand::GetOTURepCommand(string option) { help(); abort = true; } else { //valid paramters for this command - string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","outputdir","inputdir"}; + string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -160,13 +160,7 @@ GetOTURepCommand::GetOTURepCommand(string option) { groupfile = validParameter.validFile(parameters, "group", true); if (groupfile == "not open") { groupfile = ""; abort = true; } else if (groupfile == "not found") { groupfile = ""; } - else { - //read in group map info. - groupMap = new GroupMap(groupfile); - int error = groupMap->readMap(); - if (error == 1) { delete groupMap; abort = true; } - } - + sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; } if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) { m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine(); @@ -178,6 +172,18 @@ GetOTURepCommand::GetOTURepCommand(string option) { sorted = ""; } + groups = validParameter.validFile(parameters, "groups", false); + if (groups == "not found") { groups = ""; } + else { + if (groupfile == "") { + m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine(); + abort = true; + }else { + splitAtDash(groups, Groups); + } + } + globaldata->Groups = Groups; + string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; } large = isTrue(temp); @@ -199,7 +205,7 @@ GetOTURepCommand::GetOTURepCommand(string option) { void GetOTURepCommand::help(){ try { - m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, sorted and label. The fasta and list parameters are required, as well as phylip or column and name.\n"); + m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, groups, sorted and label. The fasta and list parameters are required, as well as phylip or column and name.\n"); m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n"); m->mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n"); m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n"); @@ -208,6 +214,7 @@ void GetOTURepCommand::help(){ m->mothurOut("The default value for label is all labels in your inputfile.\n"); m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n"); m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n"); + m->mothurOut("The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n"); m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n"); m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n"); m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n"); @@ -245,7 +252,7 @@ int GetOTURepCommand::execute(){ readMatrix->read(nameMap); - if (m->control_pressed) { delete readMatrix; delete groupMap; return 0; } + if (m->control_pressed) { delete readMatrix; return 0; } //get matrix if (globaldata->gListVector != NULL) { delete globaldata->gListVector; } @@ -259,14 +266,15 @@ int GetOTURepCommand::execute(){ // via the index of a sequence in the distance matrix seqVec = vector(globaldata->gListVector->size()); for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) { - if (m->control_pressed) { delete readMatrix; delete groupMap; return 0; } + if (m->control_pressed) { delete readMatrix; return 0; } seqVec[currentCell->row][currentCell->column] = currentCell->dist; } delete matrix; delete readMatrix; + delete nameMap; - if (m->control_pressed) { delete groupMap; return 0; } + if (m->control_pressed) { return 0; } }else { //process file and set up indexes if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); } @@ -282,7 +290,7 @@ int GetOTURepCommand::execute(){ formatMatrix->read(nameMap); - if (m->control_pressed) { delete formatMatrix; delete groupMap; return 0; } + if (m->control_pressed) { delete formatMatrix; return 0; } //get matrix if (globaldata->gListVector != NULL) { delete globaldata->gListVector; } @@ -295,11 +303,12 @@ int GetOTURepCommand::execute(){ rowPositions = formatMatrix->getRowPositions(); delete formatMatrix; + delete nameMap; //openfile for getMap to use openInputFile(distFile, inRow); - if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); delete groupMap; return 0; } + if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; } } @@ -320,22 +329,28 @@ int GetOTURepCommand::execute(){ } } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); } - fasta = new FastaMap(); - - //read fastafile - fasta->readFastaFile(fastafile); - + if (m->control_pressed) { if (large) { inRow.close(); remove(distFile.c_str()); } - delete groupMap; delete fasta; return 0; + return 0; } - - //if user gave a namesfile then use it - if (namefile != "") { readNamesFile(); } + if (groupfile != "") { + //read in group map info. + groupMap = new GroupMap(groupfile); + int error = groupMap->readMap(); + if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; } + + if (Groups.size() != 0) { + SharedUtil* util = new SharedUtil(); + util->setGroups(Groups, groupMap->namesOfGroups, "getoturep"); + delete util; + } + } + //set format to list so input can get listvector globaldata->setFormat("list"); - + //read list file read = new ReadOTUFile(listfile); read->read(&*globaldata); @@ -343,14 +358,14 @@ int GetOTURepCommand::execute(){ input = globaldata->ginput; list = globaldata->gListVector; string lastLabel = list->getLabel(); - + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. set processedLabels; set userLabels = labels; if (m->control_pressed) { if (large) { inRow.close(); remove(distFile.c_str()); } - delete groupMap; delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; + delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; } while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { @@ -362,9 +377,8 @@ int GetOTURepCommand::execute(){ if (m->control_pressed) { if (large) { inRow.close(); remove(distFile.c_str()); } - if (groupfile != "") { delete groupMap; globaldata->gGroupmap = NULL; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } - delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; + delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; } processedLabels.insert(list->getLabel()); @@ -382,9 +396,8 @@ int GetOTURepCommand::execute(){ if (m->control_pressed) { if (large) { inRow.close(); remove(distFile.c_str()); } - if (groupfile != "") { delete groupMap; globaldata->gGroupmap = NULL; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } - delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; + delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; } processedLabels.insert(list->getLabel()); @@ -395,7 +408,7 @@ int GetOTURepCommand::execute(){ } lastLabel = list->getLabel(); - + delete list; list = input->getListVector(); } @@ -403,8 +416,8 @@ int GetOTURepCommand::execute(){ //output error messages about any remaining user labels bool needToRun = false; for (set::iterator it = userLabels.begin(); it != userLabels.end(); it++) { - m->mothurOut("Your file does not include the label " + *it); - if (processedLabels.count(list->getLabel()) != 1) { + m->mothurOut("Your file does not include the label " + (*it)); + if (processedLabels.count(lastLabel) != 1) { m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine(); needToRun = true; }else { @@ -423,9 +436,8 @@ int GetOTURepCommand::execute(){ if (m->control_pressed) { if (large) { inRow.close(); remove(distFile.c_str()); } - if (groupfile != "") { delete groupMap; globaldata->gGroupmap = NULL; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } - delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; + delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; } } @@ -438,6 +450,20 @@ int GetOTURepCommand::execute(){ globaldata->gListVector = NULL; delete input; globaldata->ginput = NULL; delete read; + + //read fastafile + fasta = new FastaMap(); + fasta->readFastaFile(fastafile); + + //if user gave a namesfile then use it + if (namefile != "") { readNamesFile(); } + + //output create and output the .rep.fasta files + map::iterator itNameFile; + for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) { + processNames(itNameFile->first, itNameFile->second); + } + delete fasta; if (groupfile != "") { delete groupMap; globaldata->gGroupmap = NULL; @@ -492,38 +518,8 @@ void GetOTURepCommand::readNamesFile() { } } //********************************************************************************************************************** -string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) { +string GetOTURepCommand::findRep(vector names) { try{ - vector names; - map groups; - map::iterator groupIt; - - //parse names into vector - string binnames = thisList->get(bin); - splitAtComma(binnames, names); - binsize = names.size(); - - //if you have a groupfile - if (groupfile != "") { - //find the groups that are in this bin - for (size_t i = 0; i < names.size(); i++) { - string groupName = groupMap->getGroup(names[i]); - if (groupName == "not found") { - m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine(); - groupError = true; - } else { - groups[groupName] = groupName; - } - } - - //turn the groups into a string - for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { - group += groupIt->first + "-"; - } - //rip off last dash - group = group.substr(0, group.length()-1); - }else{ group = ""; } - // if only 1 sequence in bin or processing the "unique" label, then // the first sequence of the OTU is the representative one if ((names.size() == 1) || (list->getLabel() == "unique")) { @@ -584,7 +580,7 @@ string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, i } } } - + return(names[minIndex]); } } @@ -597,58 +593,184 @@ string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, i //********************************************************************************************************************** int GetOTURepCommand::process(ListVector* processList) { try{ - string nameRep, name, sequence; + string name, sequence; + string nameRep; //create output file if (outputDir == "") { outputDir += hasPath(listfile); } - string outputFileName = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.fasta"; - openOutputFile(outputFileName, out); - vector reps; - outputNames.push_back(outputFileName); - + ofstream newNamesOutput; - string outputNamesFile = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.names"; - openOutputFile(outputNamesFile, newNamesOutput); - outputNames.push_back(outputNamesFile); + string outputNamesFile; + map filehandles; + + if (Groups.size() == 0) { //you don't want to use groups + outputNamesFile = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.names"; + openOutputFile(outputNamesFile, newNamesOutput); + outputNames.push_back(outputNamesFile); + outputNameFiles[outputNamesFile] = processList->getLabel(); + }else{ //you want to use groups + ofstream* temp; + for (int i=0; igetLabel() + "." + Groups[i] + ".rep.names"; + + openOutputFile(outputNamesFile, *(temp)); + outputNames.push_back(outputNamesFile); + outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i]; + } + } //for each bin in the list vector for (int i = 0; i < processList->size(); i++) { - string groups; - int binsize; - - if (m->control_pressed) { out.close(); newNamesOutput.close(); return 0; } - - nameRep = findRep(i, groups, processList, binsize); + if (m->control_pressed) { + out.close(); + if (Groups.size() == 0) { //you don't want to use groups + newNamesOutput.close(); + }else{ + for (int j=0; jcontrol_pressed) { out.close(); newNamesOutput.close(); return 0; } + string temp = processList->get(i); + vector namesInBin; + splitAtComma(temp, namesInBin); + + if (Groups.size() == 0) { + nameRep = findRep(namesInBin); + newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl; + }else{ + map > NamesInGroup; + for (int j=0; jgetGroup(namesInBin[j]); + + if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + + if (inUsersGroups(thisgroup, Groups)) { //add this name to correct group + NamesInGroup[thisgroup].push_back(namesInBin[j]); + } + } + + //get rep for each group in otu + for (int j=0; jerrorOut(e, "GetOTURepCommand", "process"); + exit(1); + } +} +//********************************************************************************************************************** +int GetOTURepCommand::processNames(string filename, string label) { + try{ + + //create output file + if (outputDir == "") { outputDir += hasPath(listfile); } + string outputFileName = outputDir + getRootName(getSimpleName(listfile)) + label + ".rep.fasta"; + openOutputFile(outputFileName, out); + vector reps; + outputNames.push_back(outputFileName); + + ofstream out2; + string tempNameFile = filename + ".temp"; + openOutputFile(tempNameFile, out2); + + ifstream in; + openInputFile(filename, in); + + int i = 0; + while (!in.eof()) { + string rep, binnames; + in >> i >> rep >> binnames; gobble(in); + out2 << rep << '\t' << binnames << endl; - //output to new names file - newNamesOutput << nameRep << '\t' << processList->get(i) << endl; + vector names; + splitAtComma(binnames, names); + int binsize = names.size(); + + //if you have a groupfile + string group = ""; + if (groupfile != "") { + map groups; + map::iterator groupIt; + + //find the groups that are in this bin + for (size_t i = 0; i < names.size(); i++) { + string groupName = groupMap->getGroup(names[i]); + if (groupName == "not found") { + m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine(); + groupError = true; + } else { + groups[groupName] = groupName; + } + } + + //turn the groups into a string + for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { + group += groupIt->first + "-"; + } + //rip off last dash + group = group.substr(0, group.length()-1); + }else{ group = ""; } + //print out name and sequence for that bin - sequence = fasta->getSequence(nameRep); + string sequence = fasta->getSequence(rep); if (sequence != "not found") { if (sorted == "") { //print them out - nameRep = nameRep + "|" + toString(i+1); - nameRep = nameRep + "|" + toString(binsize); + rep = rep + "|" + toString(i+1); + rep = rep + "|" + toString(binsize); if (groupfile != "") { - nameRep = nameRep + "|" + groups; + rep = rep + "|" + group; } - out << ">" << nameRep << endl; + out << ">" << rep << endl; out << sequence << endl; }else { //save them - repStruct newRep(nameRep, i+1, binsize, groups); + repStruct newRep(rep, i+1, binsize, group); reps.push_back(newRep); } }else { - m->mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine(); - remove(outputFileName.c_str()); - remove(outputNamesFile.c_str()); - return 1; + m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); } } + if (sorted != "") { //then sort them and print them if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); } else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); } @@ -667,18 +789,20 @@ int GetOTURepCommand::process(ListVector* processList) { out << sequence << endl; } } - + out.close(); - newNamesOutput.close(); + out2.close(); + + rename(tempNameFile.c_str(), filename.c_str()); + return 0; } catch(exception& e) { - m->errorOut(e, "GetOTURepCommand", "process"); + m->errorOut(e, "GetOTURepCommand", "processNames"); exit(1); } } - //********************************************************************************************************************** SeqMap GetOTURepCommand::getMap(int row) { try {