X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=filterseqscommand.cpp;h=0e33ab2047359b432886384cc41d60a806fbdc7c;hb=5d176c0f8049654ec3ae5a869c9ee3cecb991dc6;hp=d45d916296583824cbc260535a98f7ae850f882a;hpb=74844a60d80c6dd06e3fb02ee9b928424f9019b0;p=mothur.git diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp index d45d916..0e33ab2 100644 --- a/filterseqscommand.cpp +++ b/filterseqscommand.cpp @@ -15,13 +15,14 @@ FilterSeqsCommand::FilterSeqsCommand(string option) { try { abort = false; + filterFileName = ""; //allow user to run help if(option == "help") { help(); abort = true; } else { //valid paramters for this command - string Array[] = {"fasta", "trump", "soft", "hard", "vertical", "outputdir","inputdir"}; + string Array[] = {"fasta", "trump", "soft", "hard", "vertical", "outputdir","inputdir", "processors"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -58,16 +59,45 @@ FilterSeqsCommand::FilterSeqsCommand(string option) { } //check for required parameters - fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the filter.seqs command."); m->mothurOutEndLine(); abort = true; } - else if (fastafile == "not open") { abort = true; } - - //if the user changes the output directory command factory will send this info to us in the output parameter - outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ - outputDir = ""; - outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it - } + fasta = validParameter.validFile(parameters, "fasta", false); + if (fasta == "not found") { m->mothurOut("fasta is a required parameter for the filter.seqs command."); m->mothurOutEndLine(); abort = true; } + else { + splitAtDash(fasta, fastafileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < fastafileNames.size(); i++) { + if (inputDir != "") { + string path = hasPath(fastafileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { fastafileNames[i] = inputDir + fastafileNames[i]; } + } + int ableToOpen; + ifstream in; + ableToOpen = openInputFile(fastafileNames[i], in); + if (ableToOpen == 1) { + m->mothurOut(fastafileNames[i] + " will be disregarded."); m->mothurOutEndLine(); + //erase from file list + fastafileNames.erase(fastafileNames.begin()+i); + i--; + }else{ + string simpleName = getSimpleName(fastafileNames[i]); + filterFileName += simpleName.substr(0, simpleName.find_first_of('.')); + } + in.close(); + } + + //make sure there is at least one valid file left + if (fastafileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } + } + + if (!abort) { + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + outputDir = ""; + outputDir += hasPath(fastafileNames[0]); //if user entered a file with a path then preserve it + } + } //check for optional parameter and set defaults // ...at some point should added some additional type checking... @@ -78,6 +108,9 @@ FilterSeqsCommand::FilterSeqsCommand(string option) { temp = validParameter.validFile(parameters, "soft", false); if (temp == "not found") { soft = 0; } else { soft = (float)atoi(temp.c_str()) / 100.0; } + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; } + convert(temp, processors); + hard = validParameter.validFile(parameters, "hard", true); if (hard == "not found") { hard = ""; } else if (hard == "not open") { abort = true; } @@ -85,13 +118,6 @@ FilterSeqsCommand::FilterSeqsCommand(string option) { numSeqs = 0; - if (abort == false) { - - if (soft != 0) { F.setSoft(soft); } - if (trump != '*') { F.setTrump(trump); } - - } - } } @@ -107,7 +133,8 @@ void FilterSeqsCommand::help(){ try { m->mothurOut("The filter.seqs command reads a file containing sequences and creates a .filter and .filter.fasta file.\n"); m->mothurOut("The filter.seqs command parameters are fasta, trump, soft, hard and vertical. \n"); - m->mothurOut("The fasta parameter is required.\n"); + m->mothurOut("The fasta parameter is required. You may enter several fasta files to build the filter from and filter, by separating their names with -'s.\n"); + m->mothurOut("For example: fasta=abrecovery.fasta-amazon.fasta \n"); m->mothurOut("The trump parameter .... The default is ...\n"); m->mothurOut("The soft parameter .... The default is ....\n"); m->mothurOut("The hard parameter .... The default is ....\n"); @@ -130,83 +157,71 @@ int FilterSeqsCommand::execute() { try { if (abort == true) { return 0; } + vector outputNames; ifstream inFASTA; - openInputFile(fastafile, inFASTA); + openInputFile(fastafileNames[0], inFASTA); Sequence testSeq(inFASTA); alignmentLength = testSeq.getAlignLength(); - inFASTA.seekg(0); - - F.setLength(alignmentLength); - - if(soft != 0 || isTrue(vertical)){ - F.initialize(); - } - - if(hard.compare("") != 0) { F.doHard(hard); } - else { F.setFilter(string(alignmentLength, '1')); } - - if(trump != '*' || isTrue(vertical) || soft != 0){ - while(!inFASTA.eof()){ //read through and create the filter... - Sequence seq(inFASTA); - if (seq.getName() != "") { - if(trump != '*'){ F.doTrump(seq); } - if(isTrue(vertical) || soft != 0){ F.getFreqs(seq); } - numSeqs++; - cout.flush(); - } - } - - } inFASTA.close(); - F.setNumSeqs(numSeqs); + ////////////create filter///////////////// - if(isTrue(vertical) == 1) { F.doVertical(); } - if(soft != 0) { F.doSoft(); } + filter = createFilter(); - filter = F.getFilter(); - ofstream outFilter; - string filterFile = outputDir + getRootName(getSimpleName(fastafile)) + "filter"; + + string filterFile = outputDir + filterFileName + ".filter"; openOutputFile(filterFile, outFilter); outFilter << filter << endl; outFilter.close(); + outputNames.push_back(filterFile); + + + ////////////run filter///////////////// - ifstream inFasta2; - openInputFile(fastafile, inFasta2); - string filteredFasta = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; - ofstream outFASTA; - openOutputFile(filteredFasta, outFASTA); - numSeqs = 0; - while(!inFasta2.eof()){ - Sequence seq(inFasta2); - if (seq.getName() != "") { - string align = seq.getAligned(); - string filterSeq = ""; + for (int i = 0; i < fastafileNames.size(); i++) { + ifstream in; + openInputFile(fastafileNames[i], in); + string filteredFasta = outputDir + getRootName(getSimpleName(fastafileNames[i])) + "filter.fasta"; + ofstream outFASTA; + openOutputFile(filteredFasta, outFASTA); + outputNames.push_back(filteredFasta); + + + while(!in.eof()){ + if (m->control_pressed) { in.close(); outFASTA.close(); for(int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } - for(int j=0;j' << seq.getName() << endl << filterSeq << endl; + numSeqs++; } - - outFASTA << '>' << seq.getName() << endl << filterSeq << endl; - numSeqs++; + gobble(in); } - gobble(inFasta2); + outFASTA.close(); + in.close(); } - outFASTA.close(); - inFasta2.close(); - int filteredLength = 0; for(int i=0;icontrol_pressed) { for(int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + + m->mothurOutEndLine(); m->mothurOut("Length of filtered alignment: " + toString(filteredLength)); m->mothurOutEndLine(); m->mothurOut("Number of columns removed: " + toString((alignmentLength-filteredLength))); m->mothurOutEndLine(); @@ -216,8 +231,7 @@ int FilterSeqsCommand::execute() { m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - m->mothurOut(filterFile); m->mothurOutEndLine(); - m->mothurOut(filteredFasta); m->mothurOutEndLine(); + for(int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); return 0; @@ -228,5 +242,170 @@ int FilterSeqsCommand::execute() { exit(1); } } +/**************************************************************************************/ +string FilterSeqsCommand::createFilter() { + try { + string filterString = ""; + + Filters F; + + if (soft != 0) { F.setSoft(soft); } + if (trump != '*') { F.setTrump(trump); } + + F.setLength(alignmentLength); + + if(soft != 0 || isTrue(vertical)){ + F.initialize(); + } + + if(hard.compare("") != 0) { F.doHard(hard); } + else { F.setFilter(string(alignmentLength, '1')); } + + numSeqs = 0; + + if(trump != '*' || isTrue(vertical) || soft != 0){ + for (int s = 0; s < fastafileNames.size(); s++) { + + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ + ifstream inFASTA; + openInputFile(fastafileNames[s], inFASTA); + int numFastaSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + + numSeqs += numFastaSeqs; + + lines.push_back(new linePair(0, numFastaSeqs)); + + driverCreateFilter(F, fastafileNames[s], lines[0]); + }else{ + vector positions; + + ifstream inFASTA; + openInputFile(fastafileNames[s], inFASTA); + + string input; + while(!inFASTA.eof()){ + input = getline(inFASTA); + if (input.length() != 0) { + if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + } + } + inFASTA.close(); + + int numFastaSeqs = positions.size(); + + numSeqs += numFastaSeqs; + + int numSeqsPerProcessor = numFastaSeqs / processors; + + for (int i = 0; i < processors; i++) { + long int startPos = positions[ i * numSeqsPerProcessor ]; + if(i == processors - 1){ + numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; + } + lines.push_back(new linePair(startPos, numSeqsPerProcessor)); + } + + createProcessesCreateFilter(F, fastafileNames[s]); + } + #else + ifstream inFASTA; + openInputFile(fastafileNames[s], inFASTA); + int numFastaSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + + numSeqs += numFastaSeqs; + + lines.push_back(new linePair(0, numFastaSeqs)); + + driverCreateFilter(F, lines[0], fastafileNames[s]); + #endif + + + } + } + + F.setNumSeqs(numSeqs); + + if(isTrue(vertical) == 1) { F.doVertical(); } + if(soft != 0) { F.doSoft(); } + + filterString = F.getFilter(); + + return filterString; + } + catch(exception& e) { + m->errorOut(e, "FilterSeqsCommand", "createFilter"); + exit(1); + } +} +/**************************************************************************************/ +int FilterSeqsCommand::driverCreateFilter(Filters& F, string filename, linePair* line) { + try { + + ifstream in; + openInputFile(filename, in); + + in.seekg(line->start); + + for(int i=0;inumSeqs;i++){ + + if (m->control_pressed) { in.close(); return 1; } + + Sequence seq(in); + if (seq.getName() != "") { + if(trump != '*'){ F.doTrump(seq); } + if(isTrue(vertical) || soft != 0){ F.getFreqs(seq); } + cout.flush(); + } + } + + in.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "FilterSeqsCommand", "driverCreateFilter"); + exit(1); + } +} +/**************************************************************************************************/ +int FilterSeqsCommand::createProcessesCreateFilter(Filters& F, string filename) { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + int exitCommand = 1; + vector processIDS; + + //loop through and create all the processes you want + while (process != processors) { + int pid = vfork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + driverCreateFilter(F, filename, lines[process]); + exit(0); + }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;ierrorOut(e, "FilterSeqsCommand", "createProcessesCreateFilter"); + exit(1); + } +} /**************************************************************************************/