X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=screenseqscommand.cpp;h=2b5ebc1a844420e699b04df0dcd6e144befbf1fe;hb=16f9c4ab6f39769856b13e048eae2c8eaa413c02;hp=686bdaf4fb43486b87b8f3ad9ba39a874c670ce0;hpb=b302d221847b504388ec044c6929e9dde42f9bb1;p=mothur.git diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index 686bdaf..2b5ebc1 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -8,14 +8,15 @@ */ #include "screenseqscommand.h" - +#include "counttable.h" //********************************************************************************************************************** vector ScreenSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile); CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport); CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax); @@ -44,12 +45,12 @@ vector ScreenSeqsCommand::setParameters(){ string ScreenSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The screen.seqs command reads a fastafile and creates .....\n"; - helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, alignreport, taxonomy, optimize, criteria and processors.\n"; + helpString += "The screen.seqs command reads a fastafile and screens sequences.\n"; + helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, count, qfile, alignreport, taxonomy, optimize, criteria and processors.\n"; helpString += "The fasta parameter is required.\n"; helpString += "The alignreport and taxonomy parameters allow you to remove bad seqs from taxonomy and alignreport files.\n"; - helpString += "The start parameter .... The default is -1.\n"; - helpString += "The end parameter .... The default is -1.\n"; + helpString += "The start parameter is used to set a position the \"good\" sequences must start by. The default is -1.\n"; + helpString += "The end parameter is used to set a position the \"good\" sequences must end after. The default is -1.\n"; helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n"; helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n"; helpString += "The minlength parameter allows you to set and minimum sequence length. \n"; @@ -70,6 +71,34 @@ string ScreenSeqsCommand::getHelpString(){ exit(1); } } +//********************************************************************************************************************** +string ScreenSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ + try { + string outputFileName = ""; + map >::iterator it; + + //is this a type this command creates + it = outputTypes.find(type); + if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } + else { + if (type == "fasta") { outputFileName = "good" + m->getExtension(inputName); } + else if (type == "taxonomy") { outputFileName = "good" + m->getExtension(inputName); } + else if (type == "name") { outputFileName = "good" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "good" + m->getExtension(inputName); } + else if (type == "group") { outputFileName = "good" + m->getExtension(inputName); } + else if (type == "accnos") { outputFileName = "bad.accnos"; } + else if (type == "qfile") { outputFileName = "good" + m->getExtension(inputName); } + else if (type == "alignreport") { outputFileName = "good.align.report"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } + } + return outputFileName; + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "getOutputFileNameTag"); + exit(1); + } +} + //********************************************************************************************************************** ScreenSeqsCommand::ScreenSeqsCommand(){ try { @@ -83,6 +112,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(){ outputTypes["accnos"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["taxonomy"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand"); @@ -122,6 +152,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { outputTypes["accnos"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["taxonomy"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -175,6 +206,14 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -202,6 +241,19 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + alignreport = validParameter.validFile(parameters, "alignreport", true); if (alignreport == "not open") { abort = true; } else if (alignreport == "not found") { alignreport = ""; } @@ -261,10 +313,12 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; } m->mothurConvert(temp, criteria); - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -285,10 +339,15 @@ int ScreenSeqsCommand::execute(){ if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here //use the namefile to optimize correctly if (namefile != "") { nameMap = m->readNames(namefile); } + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameMap = ct.getNameMap(); + } getSummary(positions); } else { - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) positions = m->divideFile(fastafile, processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); } #else @@ -296,6 +355,7 @@ int ScreenSeqsCommand::execute(){ else { int numFastaSeqs = 0; positions = m->setFilePosFasta(fastafile, numFastaSeqs); + if (positions.size() < processors) { processors = positions.size(); } //figure out how many sequences you have to process int numSeqsPerProcessor = numFastaSeqs / processors; @@ -308,8 +368,8 @@ int ScreenSeqsCommand::execute(){ #endif } - string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "good" + m->getExtension(fastafile); - string badAccnosFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos"; + string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta", fastafile); + string badAccnosFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("accnos"); int numFastaSeqs = 0; set badSeqNames; @@ -444,7 +504,9 @@ int ScreenSeqsCommand::execute(){ screenNameGroupFile(badSeqNames); if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; } }else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the group - + else if (countfile != "") { screenCountFile(badSeqNames); } + + if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; } if(alignreport != "") { screenAlignReport(badSeqNames); } @@ -491,6 +553,11 @@ int ScreenSeqsCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); @@ -513,7 +580,7 @@ int ScreenSeqsCommand::screenNameGroupFile(set badSeqNames){ string seqName, seqList, group; set::iterator it; - string goodNameFile = outputDir + m->getRootName(m->getSimpleName(namefile)) + "good" + m->getExtension(namefile); + string goodNameFile = outputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("name", namefile); outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile); ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut); @@ -559,7 +626,7 @@ int ScreenSeqsCommand::screenNameGroupFile(set badSeqNames){ ifstream inputGroups; m->openInputFile(groupfile, inputGroups); - string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile); + string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile); outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile); ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut); @@ -611,7 +678,7 @@ int ScreenSeqsCommand::getSummary(vector& positions){ vector longHomoPolymer; vector positions; -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) positions = m->divideFile(fastafile, processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); } #else @@ -619,6 +686,7 @@ int ScreenSeqsCommand::getSummary(vector& positions){ else { int numFastaSeqs = 0; positions = m->setFilePosFasta(fastafile, numFastaSeqs); + if (positions.size() < processors) { processors = positions.size(); } //figure out how many sequences you have to process int numSeqsPerProcessor = numFastaSeqs / processors; @@ -638,7 +706,7 @@ int ScreenSeqsCommand::getSummary(vector& positions){ driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]); #else int numSeqs = 0; - //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]); }else{ @@ -743,7 +811,7 @@ int ScreenSeqsCommand::driverCreateSummary(vector& startPosition, vectormothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine(); } - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) unsigned long long pos = in.tellg(); if ((pos == -1) || (pos >= filePos.end)) { break; } #else @@ -769,7 +837,7 @@ int ScreenSeqsCommand::createProcessesCreateSummary(vector& startPosition, int num = 0; vector processIDS; -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) //loop through and create all the processes you want while (process != processors) { @@ -891,8 +959,8 @@ int ScreenSeqsCommand::screenGroupFile(set badSeqNames){ string seqName, group; set::iterator it; - string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "good" + m->getExtension(groupfile); - outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile); + string goodGroupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile); + outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile); ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut); while(!inputGroups.eof()){ @@ -933,7 +1001,69 @@ int ScreenSeqsCommand::screenGroupFile(set badSeqNames){ exit(1); } } +//*************************************************************************************************************** +int ScreenSeqsCommand::screenCountFile(set badSeqNames){ + try { + ifstream in; + m->openInputFile(countfile, in); + set::iterator it; + + string goodCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile); + ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut); + + string headers = m->getline(in); m->gobble(in); + goodCountOut << headers << endl; + + string name, rest; int thisTotal; + while (!in.eof()) { + if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; } + + in >> name; m->gobble(in); + in >> thisTotal; m->gobble(in); + rest = m->getline(in); m->gobble(in); + + it = badSeqNames.find(name); + + if(it != badSeqNames.end()){ + badSeqNames.erase(it); + } + else{ + goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl; + } + } + + if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; } + + //we were unable to remove some of the bad sequences + if (badSeqNames.size() != 0) { + for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) { + m->mothurOut("Your count file does not include the sequence " + *it + " please correct."); + m->mothurOutEndLine(); + } + } + + in.close(); + goodCountOut.close(); + + //check for groups that have been eliminated + CountTable ct; + if (ct.testGroups(goodCountFile)) { + ct.readTable(goodCountFile); + ct.printTable(goodCountFile); + } + + if (m->control_pressed) { m->mothurRemove(goodCountFile); } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "screenCountFile"); + exit(1); + } +} //*************************************************************************************************************** int ScreenSeqsCommand::screenAlignReport(set badSeqNames){ @@ -943,7 +1073,7 @@ int ScreenSeqsCommand::screenAlignReport(set badSeqNames){ string seqName, group; set::iterator it; - string goodAlignReportFile = outputDir + m->getRootName(m->getSimpleName(alignreport)) + "good" + m->getExtension(alignreport); + string goodAlignReportFile = outputDir + m->getRootName(m->getSimpleName(alignreport)) + getOutputFileNameTag("alignreport"); outputNames.push_back(goodAlignReportFile); outputTypes["alignreport"].push_back(goodAlignReportFile); ofstream goodAlignReportOut; m->openOutputFile(goodAlignReportFile, goodAlignReportOut); @@ -1007,7 +1137,7 @@ int ScreenSeqsCommand::screenTaxonomy(set badSeqNames){ string seqName, tax; set::iterator it; - string goodTaxFile = outputDir + m->getRootName(m->getSimpleName(taxonomy)) + "good" + m->getExtension(taxonomy); + string goodTaxFile = outputDir + m->getRootName(m->getSimpleName(taxonomy)) + getOutputFileNameTag("taxonomy", taxonomy); outputNames.push_back(goodTaxFile); outputTypes["taxonomy"].push_back(goodTaxFile); ofstream goodTaxOut; m->openOutputFile(goodTaxFile, goodTaxOut); @@ -1056,7 +1186,7 @@ int ScreenSeqsCommand::screenQual(set badSeqNames){ m->openInputFile(qualfile, in); set::iterator it; - string goodQualFile = outputDir + m->getRootName(m->getSimpleName(qualfile)) + "good" + m->getExtension(qualfile); + string goodQualFile = outputDir + m->getRootName(m->getSimpleName(qualfile)) + getOutputFileNameTag("qfile", qualfile); outputNames.push_back(goodQualFile); outputTypes["qfile"].push_back(goodQualFile); ofstream goodQual; m->openOutputFile(goodQualFile, goodQual); @@ -1163,7 +1293,7 @@ int ScreenSeqsCommand::driver(linePair filePos, string goodFName, string badAccn count++; } - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= filePos.end)) { break; } #else @@ -1207,7 +1337,6 @@ int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& int length = MPIPos[start+i+1] - MPIPos[start+i]; char* buf4 = new char[length]; - memcpy(buf4, outputString.c_str(), length); MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status); @@ -1275,7 +1404,7 @@ int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, st int process = 1; int num = 0; -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) //loop through and create all the processes you want while (process != processors) { @@ -1356,7 +1485,7 @@ int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, st if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); } // Allocate memory for thread data. - sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, filename, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension, &badSeqNames); + sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, filename, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension); pDataArray.push_back(tempSum); //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier @@ -1373,6 +1502,7 @@ int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, st //Close all thread handles and free memory allocations. for(int i=0; i < pDataArray.size(); i++){ num += pDataArray[i]->count; + for (set::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); } CloseHandle(hThreadArray[i]); delete pDataArray[i]; }