X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=filterseqscommand.cpp;h=95d51f329f30d8a639d0772844e54be5246e5005;hb=cd37904452dc95b183ff313ff05720c562902487;hp=f4cc6d82fbe5d97779d4d80dbba9ee9a483940ab;hpb=37a2c0b7dc68bfa53c8f38a713259929c4c46a9f;p=mothur.git diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp index f4cc6d8..95d51f3 100644 --- a/filterseqscommand.cpp +++ b/filterseqscommand.cpp @@ -8,160 +8,241 @@ */ #include "filterseqscommand.h" +#include "sequence.hpp" /**************************************************************************************/ -void FilterSeqsCommand::doTrump() { - trump = globaldata->getTrump(); - for(int i = 0; i < db->size(); i++) { - Sequence cur = db->get(i); - string curAligned = cur.getAligned(); - for(int j = 0; j < curAligned.length(); j++) { - string curChar = curAligned.substr(j, 1); - if(curChar.compare(trump) == 0) - columnsToRemove[j] = true; + +FilterSeqsCommand::FilterSeqsCommand(string option){ + try { + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"fasta", "trump", "soft", "hard", "vertical"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + + //check to make sure all parameters are valid for command + for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //check for required parameters + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not found") { cout << "fasta is a required parameter for the filter.seqs command." << endl; abort = true; } + else if (fastafile == "not open") { abort = true; } + + //check for optional parameter and set defaults + // ...at some point should added some additional type checking... + + string temp; + temp = validParameter.validFile(parameters, "trump", false); if (temp == "not found") { temp = "*"; } + trump = temp[0]; + + temp = validParameter.validFile(parameters, "soft", false); if (temp == "not found") { soft = 0; } + else { soft = (float)atoi(temp.c_str()) / 100.0; } + + hard = validParameter.validFile(parameters, "hard", true); if (hard == "not found") { hard = ""; } + else if (hard == "not open") { abort = true; } + + vertical = validParameter.validFile(parameters, "vertical", false); if (vertical == "not found") { vertical = "T"; } + + numSeqs = 0; + } + + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the FilterSeqsCommand class Function FilterSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); } + catch(...) { + cout << "An unknown error has occurred in the FilterSeqsCommand class function FilterSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } } -/**************************************************************************************/ -void FilterSeqsCommand::doSoft() { - soft = atoi(globaldata->getSoft().c_str()); - vector > columnSymbolSums; - vector > columnSymbols; - for(int i = 0; i < db->get(0).getLength(); i++) { - vector symbols; - vector sums; - columnSymbols.push_back(symbols); - columnSymbolSums.push_back(sums); +//********************************************************************************************************************** + +void FilterSeqsCommand::help(){ + try { + cout << "The filter.seqs command reads a file containing sequences and creates a .filter and .filter.fasta file." << "\n"; + cout << "The filter.seqs command parameters are fasta, trump, soft, hard and vertical. " << "\n"; + cout << "The fasta parameter is required." << "\n"; + cout << "The trump parameter .... The default is ..." << "\n"; + cout << "The soft parameter .... The default is ...." << "\n"; + cout << "The hard parameter .... The default is ...." << "\n"; + cout << "The vertical parameter .... The default is T." << "\n"; + cout << "The filter.seqs command should be in the following format: " << "\n"; + cout << "filter.seqs(fasta=yourFastaFile, trump=yourTrump, soft=yourSoft, hard=yourHard, vertical=yourVertical) " << "\n"; + cout << "Example filter.seqs(fasta=abrecovery.fasta, trump=..., soft=..., hard=..., vertical=T)." << "\n"; + cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta)." << "\n" << "\n"; + + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the FilterSeqsCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); } + catch(...) { + cout << "An unknown error has occurred in the FilterSeqsCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} + +/**************************************************************************************/ + +void FilterSeqsCommand::doHard() { - for(int i = 0; i < db->size(); i++) { - Sequence cur = db->get(i); - string curAligned = cur.getAligned(); - - for(int j = 0; j < curAligned.length(); j++) { - string curChar = curAligned.substr(j, 1); - vector curColumnSymbols = columnSymbols[j]; - bool newSymbol = true; - - for(int k = 0; k < curColumnSymbols.size(); k++) - if(curChar.compare(curColumnSymbols[k]) == 0) { - newSymbol = false; - columnSymbolSums[j][k]++; - } - - if(newSymbol) { - columnSymbols[j].push_back(curChar); - columnSymbolSums[j].push_back(1); - } + ifstream fileHandle; + openInputFile(hard, fileHandle); + + fileHandle >> filter; + +} + +/**************************************************************************************/ + +void FilterSeqsCommand::doTrump(Sequence seq) { + + string curAligned = seq.getAligned(); + + for(int j = 0; j < alignmentLength; j++) { + if(curAligned[j] == trump){ + filter[j] = '0'; } } + +} + +/**************************************************************************************/ + +void FilterSeqsCommand::doVertical() { + + for(int i=0;i curColumnSymbols = columnSymbolSums[i]; - - for(int j = 0; j < curColumnSymbols.size(); j++) { - int curSum = curColumnSymbols[j]; - //cout << columnSymbols[i][j] << ": " << curSum << "\n"; - if(curSum > max) - max = curSum; - totalSum += curSum; - } - //cout << "\n"; - - if((double)max/(double)totalSum * 100 < soft) - columnsToRemove[i] = true; + for(int i=0;igetFilter(); - ifstream filehandle; - openInputFile(filter, filehandle); + +void FilterSeqsCommand::getFreqs(Sequence seq) { + + string curAligned = seq.getAligned();; - char c; - int count = 0; - while(!filehandle.eof()) { - c = filehandle.get(); - if(c == '0') - columnsToRemove[count] = true; - count++; + for(int j=0;jinputFileName; - - if(globaldata->getFastaFile() != "") { - readSeqs = new ReadFasta(filename); } - else if(globaldata->getNexusFile() != "") { - readSeqs = new ReadNexus(filename); } - else if(globaldata->getClustalFile() != "") { - readSeqs = new ReadClustal(filename); } - else if(globaldata->getPhylipFile() != "") { - readSeqs = new ReadPhylip(filename); } - - readSeqs->read(); - db = readSeqs->getDB(); - - //for(int i = 0; i < db->size(); i++) { -// cout << db->get(i).getLength() << "\n" << db->get(i).getName() << ": " << db->get(i).getAligned() << "\n\n"; -// } - - for(int i = 0; i < db->get(0).getLength(); i++) - columnsToRemove.push_back(false); - - - if(globaldata->getTrump().compare("") != 0) - doTrump(); - else if(globaldata->getSoft().compare("") != 0) - doSoft(); - else if(globaldata->getFilter().compare("") != 0) - doFilter(); - - //for(int i = 0; i < columnsToRemove.size(); i++) -// { -// cout << "Remove Column " << i << " = "; -// if(columnsToRemove[i]) -// cout << "true\n"; -// else -// cout << "false\n"; -// } - - - //Creating the new SequenceDB - SequenceDB newDB; - for(int i = 0; i < db->size(); i++) { - Sequence curSeq = db->get(i); - string curAligned = curSeq.getAligned(); - string curName = curSeq.getName(); - string newAligned = ""; - for(int j = 0; j < curAligned.length(); j++) - if(!columnsToRemove[j]) - newAligned += curAligned.substr(j, 1); - - Sequence newSeq(curName, newAligned); - newDB.add(newSeq); + + if (abort == true) { return 0; } + + ifstream inFASTA; + openInputFile(fastafile, inFASTA); + + Sequence testSeq(inFASTA); + alignmentLength = testSeq.getAlignLength(); + inFASTA.seekg(0); + + if(soft != 0 || isTrue(vertical)){ + a.assign(alignmentLength, 0); + t.assign(alignmentLength, 0); + g.assign(alignmentLength, 0); + c.assign(alignmentLength, 0); + gap.assign(alignmentLength, 0); } - string newFileName = getRootName(filename) + "filter.fa"; - ofstream outfile; - outfile.open(newFileName.c_str()); - newDB.print(outfile); - outfile.close(); - - globaldata->clear(); - //delete db; - //delete newDB; + if(hard.compare("") != 0) { doHard(); } + else { filter = string(alignmentLength, '1'); } + + if(trump != '*' || isTrue(vertical) || soft != 0){ + while(!inFASTA.eof()){ //read through and create the filter... + Sequence seq(inFASTA); + if(trump != '*'){ doTrump(seq); } + if(isTrue(vertical) || soft != 0){ getFreqs(seq); } + numSeqs++; + cout.flush(); + } + + } + inFASTA.close(); + + if(isTrue(vertical) == 1) { doVertical(); } + if(soft != 0) { doSoft(); } + + ofstream outFilter; + string filterFile = getRootName(fastafile) + "filter"; + openOutputFile(filterFile, outFilter); + outFilter << filter << endl; + outFilter.close(); + + + openInputFile(fastafile, inFASTA); + string filteredFasta = getRootName(fastafile) + "filter.fasta"; + ofstream outFASTA; + openOutputFile(filteredFasta, outFASTA); + + numSeqs = 0; + while(!inFASTA.eof()){ + Sequence seq(inFASTA); + string align = seq.getAligned(); + string filterSeq = ""; + + for(int j=0;j' << seq.getName() << endl << filterSeq << endl; + numSeqs++; + gobble(inFASTA); + } + outFASTA.close(); + inFASTA.close(); + + + int filteredLength = 0; + for(int i=0;i