From: westcott Date: Tue, 9 Nov 2010 17:40:05 +0000 (+0000) Subject: parsimony calculator changed order of groups to reflect change in shared Utils made... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=4ad32478c7b1bbb464c839a6e133f5b517a3efec parsimony calculator changed order of groups to reflect change in shared Utils made to make output of summary.shared match unifrac commands --- diff --git a/fastamap.cpp b/fastamap.cpp index 25860e1..bf55493 100644 --- a/fastamap.cpp +++ b/fastamap.cpp @@ -64,11 +64,13 @@ void FastaMap::readFastaFile(string inFastaFile, string oldNameFileName){ //prin m->openInputFile(oldNameFileName, oldNameFile); map oldNameMap; + map::iterator itName; string name, list; while(!oldNameFile.eof()){ if (m->control_pressed) { break; } - oldNameFile >> name >> list; + oldNameFile >> name; m->gobble(oldNameFile); + oldNameFile >> list; oldNameMap[name] = list; m->gobble(oldNameFile); } @@ -87,6 +89,10 @@ void FastaMap::readFastaFile(string inFastaFile, string oldNameFileName){ //prin if(currSeq.getIsAligned()) { sequence = currSeq.getAligned(); } else { sequence = currSeq.getUnaligned(); } + itName = seqmap.find(name); + if (itName == seqmap.end()) { seqmap[name] = sequence; } + else { m->mothurOut("You already have a sequence named " + name + ", sequence names must be unique, please correct."); m->mothurOutEndLine(); } + seqmap[name] = sequence; map::iterator it = data.find(sequence); if (it == data.end()) { //it's unique. diff --git a/listseqscommand.cpp b/listseqscommand.cpp index b4b4674..1488cda 100644 --- a/listseqscommand.cpp +++ b/listseqscommand.cpp @@ -367,7 +367,7 @@ int ListSeqsCommand::readGroup(){ if (m->control_pressed) { in.close(); return 0; } - in >> name; //read from first column + in >> name; m->gobble(in); //read from first column in >> group; //read from second column names.push_back(name); diff --git a/mothur b/mothur index e33ddcd..1b53d19 100755 Binary files a/mothur and b/mothur differ diff --git a/mothurout.cpp b/mothurout.cpp index 9ec847c..bd1a098 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -522,7 +522,7 @@ string MothurOut::getFullPathName(string fileName){ newFileName = homeDir + fileName.substr(fileName.find("~")+1); return newFileName; }else { //find path - if (path.rfind("./") == -1) { return fileName; } //already complete name + if (path.rfind("./") == string::npos) { return fileName; } //already complete name else { newFileName = fileName.substr(fileName.rfind("./")+2); } //save the complete part of the name //char* cwdpath = new char[1024]; @@ -542,7 +542,7 @@ string MothurOut::getFullPathName(string fileName){ //break apart the current working directory vector dirs; - while (simpleCWD.find_first_of('/') != -1) { + while (simpleCWD.find_first_of('/') != string::npos) { string dir = simpleCWD.substr(0,simpleCWD.find_first_of('/')); simpleCWD = simpleCWD.substr(simpleCWD.find_first_of('/')+1, simpleCWD.length()); dirs.push_back(dir); @@ -553,7 +553,7 @@ string MothurOut::getFullPathName(string fileName){ int index = dirs.size()-1; - while((pos = path.rfind("./")) != -1) { //while you don't have a complete path + while((pos = path.rfind("./")) != string::npos) { //while you don't have a complete path if (pos == 0) { break; //you are at the end }else if (path[(pos-1)] == '.') { //you want your parent directory ../ path = path.substr(0, pos-1); @@ -573,12 +573,12 @@ string MothurOut::getFullPathName(string fileName){ return newFileName; } #else - if (path.find("~") != -1) { //go to home directory + if (path.find("~") != string::npos) { //go to home directory string homeDir = getenv ("HOMEPATH"); newFileName = homeDir + fileName.substr(fileName.find("~")+1); return newFileName; }else { //find path - if (path.rfind(".\\") == -1) { return fileName; } //already complete name + if (path.rfind(".\\") == string::npos) { return fileName; } //already complete name else { newFileName = fileName.substr(fileName.rfind(".\\")+2); } //save the complete part of the name char *cwdpath = NULL; @@ -599,7 +599,7 @@ string MothurOut::getFullPathName(string fileName){ int index = dirs.size()-1; - while((pos = path.rfind(".\\")) != -1) { //while you don't have a complete path + while((pos = path.rfind(".\\")) != string::npos) { //while you don't have a complete path if (pos == 0) { break; //you are at the end }else if (path[(pos-1)] == '.') { //you want your parent directory ../ path = path.substr(0, pos-1); @@ -658,7 +658,7 @@ int MothurOut::openInputFile(string fileName, ifstream& fileHandle, string m){ fileHandle.open(completeFileName.c_str()); if(!fileHandle) { - mothurOut("[ERROR]: Could not open " + completeFileName); mothurOutEndLine(); + //mothurOut("[ERROR]: Could not open " + completeFileName); mothurOutEndLine(); return 1; }else { //check for blank file diff --git a/parsefastaqcommand.cpp b/parsefastaqcommand.cpp index 59126fb..f0a7abc 100644 --- a/parsefastaqcommand.cpp +++ b/parsefastaqcommand.cpp @@ -175,7 +175,7 @@ int ParseFastaQCommand::execute(){ if (qual == "") { m->mothurOut("[ERROR]: missing quality for " + name2); m->mothurOutEndLine(); m->control_pressed = true; break; } //sanity check sequence length and number of quality scores match - if (name != name2) { m->mothurOut("[ERROR]: names do not match. read " + name + " for fasta and " + name2 + " for quality."); m->mothurOutEndLine(); m->control_pressed = true; break; } + if (name2 != "") { if (name != name2) { m->mothurOut("[ERROR]: names do not match. read " + name + " for fasta and " + name2 + " for quality."); m->mothurOutEndLine(); m->control_pressed = true; break; } } if (qual.length() != sequence.length()) { m->mothurOut("[ERROR]: lengths do not match. read " + toString(sequence.length()) + " characters for fasta and " + toString(qual.length()) + " characters for quality scores."); m->mothurOutEndLine(); m->control_pressed = true; break; } //convert quality scores diff --git a/parsimony.cpp b/parsimony.cpp index 0487822..efc7d3a 100644 --- a/parsimony.cpp +++ b/parsimony.cpp @@ -34,7 +34,7 @@ EstOutput Parsimony::getValues(Tree* t) { int count = 0; for (int a=0; agetGroupFile()); - groupmap->readMap(); + groupmap->readMap(); int hold; string inputData; diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index 036341f..3def1d1 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -8,6 +8,7 @@ */ #include "subsamplecommand.h" +#include "sharedutilities.h" //********************************************************************************************************************** vector SubSampleCommand::getValidParameters(){ @@ -226,6 +227,9 @@ SubSampleCommand::SubSampleCommand(string option) { if ((groupfile != "") && ((fastafile == "") && (listfile == ""))) { m->mothurOut("Group file only valid with listfile or fastafile."); m->mothurOutEndLine(); abort = true; } + if ((groupfile != "") && ((fastafile != "") && (listfile != ""))) { + m->mothurOut("A new group file can only be made from the subsample of a listfile or fastafile, not both. Please correct."); m->mothurOutEndLine(); abort = true; } + } } @@ -270,10 +274,16 @@ int SubSampleCommand::execute(){ if (abort == true) { return 0; } if (sharedfile != "") { getSubSampleShared(); } - //if (listfile != "") { getSubSampleList(); } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); return 0; } } + if (listfile != "") { getSubSampleList(); } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); return 0; } } //if (rabund != "") { getSubSampleRabund(); } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); return 0; } } //if (sabundfile != "") { getSubSampleSabund(); } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); return 0; } } //if (fastafile != "") { getSubSampleFasta(); } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); return 0; } } + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); @@ -302,31 +312,31 @@ int SubSampleCommand::getSubSampleShared() { InputData* input = new InputData(sharedfile, "sharedfile"); vector lookup = input->getSharedRAbundVectors(); string lastLabel = lookup[0]->getLabel(); - + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. set processedLabels; set userLabels = labels; - - + + //as long as you are not at the end of the file or done wih the lines you want while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { - if (m->control_pressed) { out.close(); return 0; } - + if (m->control_pressed) { delete input; out.close(); return 0; } + if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){ - + m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); processShared(lookup, out); - + processedLabels.insert(lookup[0]->getLabel()); userLabels.erase(lookup[0]->getLabel()); } if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { string saveLabel = lookup[0]->getLabel(); - + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } - + lookup = input->getSharedRAbundVectors(lastLabel); m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); @@ -342,14 +352,14 @@ int SubSampleCommand::getSubSampleShared() { lastLabel = lookup[0]->getLabel(); //prevent memory leak for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; } - + //get next line to process lookup = input->getSharedRAbundVectors(); } if (m->control_pressed) { out.close(); return 0; } - + //output error messages about any remaining user labels set::iterator it; bool needToRun = false; @@ -362,7 +372,7 @@ int SubSampleCommand::getSubSampleShared() { m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine(); } } - + //run last label if you need to if (needToRun == true) { for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } } @@ -374,11 +384,12 @@ int SubSampleCommand::getSubSampleShared() { for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } } - + + delete input; out.close(); - + return 0; - + } catch(exception& e) { m->errorOut(e, "SubSampleCommand", "getSubSampleShared"); @@ -389,7 +400,7 @@ int SubSampleCommand::getSubSampleShared() { int SubSampleCommand::processShared(vector& thislookup, ofstream& out) { try { - if (pickedGroups) { eliminateZeroOTUS(thislookup); } + //if (pickedGroups) { eliminateZeroOTUS(thislookup); } if (size == 0) { //user has not set size, set size = smallest samples size size = thislookup[0]->getNumSeqs(); @@ -403,11 +414,11 @@ int SubSampleCommand::processShared(vector& thislookup, ofs int numBins = thislookup[0]->getNumBins(); for (int i = 0; i < thislookup.size(); i++) { int thisSize = thislookup[i]->getNumSeqs(); - + if (thisSize != size) { string thisgroup = thislookup[i]->getGroup(); - + OrderVector* order = new OrderVector(); for(int p=0;pgetAbundance(p);j++){ @@ -425,6 +436,9 @@ int SubSampleCommand::processShared(vector& thislookup, ofs for (int j = 0; j < size; j++) { + + if (m->control_pressed) { delete order; return 0; } + //get random number to sample from order between 0 and thisSize-1. int myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1)); @@ -440,6 +454,8 @@ int SubSampleCommand::processShared(vector& thislookup, ofs //subsampling may have created some otus with no sequences in them eliminateZeroOTUS(thislookup); + if (m->control_pressed) { return 0; } + for (int i = 0; i < thislookup.size(); i++) { out << thislookup[i]->getLabel() << '\t' << thislookup[i]->getGroup() << '\t'; thislookup[i]->print(out); @@ -449,7 +465,232 @@ int SubSampleCommand::processShared(vector& thislookup, ofs } catch(exception& e) { - m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS"); + m->errorOut(e, "SubSampleCommand", "processShared"); + exit(1); + } +} +//********************************************************************************************************************** +int SubSampleCommand::getSubSampleList() { + try { + + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(listfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(listfile)) + "subsample" + m->getExtension(listfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + outputTypes["list"].push_back(outputFileName); outputNames.push_back(outputFileName); + + InputData* input; + //if you have a groupfile you want to read a shared list + if (groupfile != "") { + + GroupMap* groupMap = new GroupMap(groupfile); + groupMap->readMap(); + + //takes care of user setting groupNames that are invalid or setting groups=all + SharedUtil* util = new SharedUtil(); + util->setGroups(Groups, groupMap->namesOfGroups); + delete util; + + //create outputfiles + string groupOutputDir = outputDir; + if (outputDir == "") { groupOutputDir += m->hasPath(groupfile); } + string groupOutputFileName = groupOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "subsample" + m->getExtension(groupfile); + + ofstream outGroup; + m->openOutputFile(groupOutputFileName, outGroup); + outputTypes["group"].push_back(groupOutputFileName); outputNames.push_back(groupOutputFileName); + + globaldata->setGroupFile(groupfile); //shared list needs this + + input = new InputData(listfile, "list"); + ListVector* list = input->getListVector(); + string lastLabel = list->getLabel(); + + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. + set processedLabels; + set userLabels = labels; + + //file mismatch quit + if (list->getNumSeqs() != groupMap->getNumSeqs()) { + m->mothurOut("[ERROR]: your list file contains " + toString(list->getNumSeqs()) + " sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); + m->mothurOutEndLine(); + delete groupMap; + delete list; + delete input; + out.close(); + outGroup.close(); + return 0; + } + + //as long as you are not at the end of the file or done wih the lines you want + while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { + + if (m->control_pressed) { delete list; delete input; delete groupMap; out.close(); outGroup.close(); return 0; } + + if(allLines == 1 || labels.count(list->getLabel()) == 1){ + + m->mothurOut(list->getLabel()); m->mothurOutEndLine(); + + processListGroup(list, groupMap, out, outGroup); + + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + } + + if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + string saveLabel = list->getLabel(); + + delete list; + + list = input->getListVector(lastLabel); + m->mothurOut(list->getLabel()); m->mothurOutEndLine(); + + processListGroup(list, groupMap, out, outGroup); + + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + + //restore real lastlabel to save below + list->setLabel(saveLabel); + } + + lastLabel = list->getLabel(); + + delete list; list = NULL; + + //get next line to process + list = input->getListVector(); + } + + + if (m->control_pressed) { if (list != NULL) { delete list; } delete input; delete groupMap; out.close(); outGroup.close(); return 0; } + + //output error messages about any remaining user labels + set::iterator it; + bool needToRun = false; + for (it = userLabels.begin(); it != userLabels.end(); it++) { + m->mothurOut("Your file does not include the label " + *it); + if (processedLabels.count(lastLabel) != 1) { + m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine(); + needToRun = true; + }else { + m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine(); + } + } + + //run last label if you need to + if (needToRun == true) { + if (list != NULL) { delete list; } + + list = input->getListVector(lastLabel); + + m->mothurOut(list->getLabel()); m->mothurOutEndLine(); + + processListGroup(list, groupMap, out, outGroup); + + delete list; list = NULL; + } + + out.close(); outGroup.close(); + if (list != NULL) { delete list; } + delete groupMap; + + }else { + //need to complete + } + + + delete input; + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SubSampleCommand", "getSubSampleList"); + exit(1); + } +} +//********************************************************************************************************************** +int SubSampleCommand::processListGroup(ListVector*& list, GroupMap*& groupMap, ofstream& out, ofstream& outGroup) { + try { + + if (size == 0) { //user has not set size, set size = 10% samples size + size = int (list->getNumSeqs() * 0.10); + } + + int numBins = list->getNumBins(); + int thisSize = list->getNumSeqs(); + + if (size > thisSize) { m->mothurOut("Your list file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine(); + size = thisSize; + } + + vector seqs; + for (int i = 0; i < numBins; i++) { + string names = list->get(i); + + //parse names + string individual = ""; + int length = names.length(); + for(int j=0;jsetLabel(list->getLabel()); + + delete list; + list = temp; + + set alreadySelected; //dont want repeat sequence names added + alreadySelected.insert(-1); + for (int j = 0; j < size; j++) { + + if (m->control_pressed) { return 0; } + + //get random sequence to add, making sure we have not already added it + int myrand = -1; + while (alreadySelected.count(myrand) != 0) { + myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1)); + } + alreadySelected.insert(myrand); + + //update new list + string oldNames = temp->get(seqs[myrand].bin); + if (oldNames == "") { oldNames += seqs[myrand].name; } + else { oldNames += "," + seqs[myrand].name; } + + temp->set(seqs[myrand].bin, oldNames); + + //update group file + string group = groupMap->getGroup(seqs[myrand].name); + if (group == "not found") { m->mothurOut("[ERROR]: " + seqs[myrand].name + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + + outGroup << seqs[myrand].name << '\t' << group << endl; + } + + if (m->control_pressed) { return 0; } + + list->print(out); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SubSampleCommand", "processListGroup"); exit(1); } } diff --git a/subsamplecommand.h b/subsamplecommand.h index f068ac3..9f3e55b 100644 --- a/subsamplecommand.h +++ b/subsamplecommand.h @@ -32,6 +32,14 @@ public: void help(); private: + + struct nameToBin { + string name; + int bin; + + nameToBin(string n, int b) : name(n), bin(b) {} + }; + GlobalData* globaldata; bool abort, pickedGroups, allLines; @@ -44,7 +52,9 @@ private: int eliminateZeroOTUS(vector&); int getSubSampleShared(); + int getSubSampleList(); int processShared(vector&, ofstream&); + int processListGroup(ListVector*&, GroupMap*&, ofstream&, ofstream&); };