X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=filterseqscommand.cpp;h=a6bd54982e9ca509463f5d40822d1ae232e83bb1;hb=5b7ac70116137b52dd7884b76c5bca660a5fea38;hp=dec53a5b6f72d5f365295cfc71386bbe7797d0b2;hpb=3117b1c3109121dff476997d3c5db5b47a77729b;p=mothur.git diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp index dec53a5..a6bd549 100644 --- a/filterseqscommand.cpp +++ b/filterseqscommand.cpp @@ -8,140 +8,162 @@ */ #include "filterseqscommand.h" -#include -#include /**************************************************************************************/ -void FilterSeqsCommand::doTrump() { - trump = globaldata->getTrump(); - for(int i = 0; i < db->size(); i++) { - Sequence cur = db->get(i); - string curAligned = cur.getAligned(); - for(int j = 0; j < curAligned.length(); j++) { - string curChar = curAligned.substr(j, 1); - if(curChar.compare(trump) == 0) - columnsToRemove[j] = true; - } - } + +FilterSeqsCommand::FilterSeqsCommand(){ + globaldata = GlobalData::getInstance(); + + if(globaldata->getFastaFile() != "") { readSeqs = new ReadFasta(globaldata->inputFileName); } + else if(globaldata->getNexusFile() != "") { readSeqs = new ReadNexus(globaldata->inputFileName); } + else if(globaldata->getClustalFile() != "") { readSeqs = new ReadClustal(globaldata->inputFileName); } + else if(globaldata->getPhylipFile() != "") { readSeqs = new ReadPhylip(globaldata->inputFileName); } + + readSeqs->read(); + db = readSeqs->getDB(); + numSeqs = db->size(); + + alignmentLength = db->get(0).getAlignLength(); + + filter = string(alignmentLength, '1'); } /**************************************************************************************/ -void FilterSeqsCommand::doSoft() { - soft = atoi(globaldata->getSoft().c_str()); - vector > columnSymbolSums; - vector > columnSymbols; - for(int i = 0; i < db->get(0).getLength(); i++) { - vector symbols; - vector sums; - columnSymbols.push_back(symbols); - columnSymbolSums.push_back(sums); - } + +void FilterSeqsCommand::doHard() { - for(int i = 0; i < db->size(); i++) { - Sequence cur = db->get(i); - string curAligned = cur.getAligned(); + string hardName = globaldata->getHard(); + string hardFilter = ""; - for(int j = 0; j < curAligned.length(); j++) { - string curChar = curAligned.substr(j, 1); - vector curColumnSymbols = columnSymbols[j]; - bool newSymbol = true; - - for(int k = 0; k < curColumnSymbols.size(); k++) - if(curChar.compare(curColumnSymbols[k]) == 0) { - newSymbol = false; - columnSymbolSums[j][k]++; - } - - if(newSymbol) { - columnSymbols[j].push_back(curChar); - columnSymbolSums[j].push_back(1); + ifstream fileHandle; + openInputFile(hardName, fileHandle); + + fileHandle >> hardFilter; + + if(hardFilter.length() != filter.length()){ + cout << "The hard filter is not the same length as the alignment: Hard filter will not be applied." << endl; + } + else{ + filter = hardFilter; + } + +} + +/**************************************************************************************/ + +void FilterSeqsCommand::doTrump() { + + char trump = globaldata->getTrump()[0]; + + for(int i = 0; i < numSeqs; i++) { + string curAligned = db->get(i).getAligned();; + + for(int j = 0; j < alignmentLength; j++) { + if(curAligned[j] == trump){ + filter[j] = '0'; } } } + +} + +/**************************************************************************************/ + +void FilterSeqsCommand::doVertical() { + + vector counts(alignmentLength, 0); - - for(int i = 0; i < columnSymbolSums.size(); i++) { - int totalSum = 0; - int max = 0; - vector curColumnSymbols = columnSymbolSums[i]; + for(int i = 0; i < numSeqs; i++) { + string curAligned = db->get(i).getAligned();; - for(int j = 0; j < curColumnSymbols.size(); j++) { - int curSum = curColumnSymbols[j]; - //cout << columnSymbols[i][j] << ": " << curSum << "\n"; - if(curSum > max) - max = curSum; - totalSum += curSum; + for(int j = 0; j < alignmentLength; j++) { + if(curAligned[j] == '-' || curAligned[j] == '.'){ + counts[j]++; + } } - //cout << "\n"; - - if((double)max/(double)totalSum * 100 < soft) - columnsToRemove[i] = true; + } + for(int i=0;igetFilter(); - ifstream filehandle; - openInputFile(filter, filehandle); + +void FilterSeqsCommand::doSoft() { + + int softThreshold = numSeqs * (float)atoi(globaldata->getSoft().c_str()) / 100.0; + + vector a(alignmentLength, 0); + vector t(alignmentLength, 0); + vector g(alignmentLength, 0); + vector c(alignmentLength, 0); + vector x(alignmentLength, 0); - char c; - int count = 0; - while(!filehandle.eof()) { - c = filehandle.get(); - if(c == '0') - columnsToRemove[count] = true; - count++; + for(int i=0;iget(i).getAligned();; + + for(int j=0;jgSequenceDB; - - for(int i = 0; i < db->get(0).getLength(); i++) - columnsToRemove.push_back(false); + + if(globaldata->getHard().compare("") != 0) { doHard(); } // has to be applied first! + if(globaldata->getTrump().compare("") != 0) { doTrump(); } + if(globaldata->getVertical() == "T") { doVertical(); } + if(globaldata->getSoft().compare("") != 0) { doSoft(); } + + ofstream outfile; + string filterFile = getRootName(globaldata->inputFileName) + "filter"; + openOutputFile(filterFile, outfile); + + outfile << filter << endl; + outfile.close(); - - if(globaldata->getTrump().compare("") != 0) - doTrump(); - else if(globaldata->getSoft().compare("") != 0) - doSoft(); - - else if(globaldata->getFilter().compare("") != 0) - doFilter(); + string filteredFasta = getRootName(globaldata->inputFileName) + "filter.fasta"; + openOutputFile(filteredFasta, outfile); + + for(int i=0;iget(i).getAligned(); + outfile << '>' << db->get(i).getName() << endl; + for(int j=0;jsize(); i++) { - Sequence curSeq = db->get(i); - string curAligned = curSeq.getAligned(); - string curName = curSeq.getName(); - string newAligned = ""; - for(int j = 0; j < curAligned.length(); j++) - if(!columnsToRemove[j]) - newAligned += curAligned.substr(j, 1); - - Sequence newSeq(curName, newAligned); - newDB.add(newSeq); + int filteredLength = 0; + for(int i=0;iclear(); + return 0; + } catch(exception& e) { cout << "Standard Error: " << e.what() << " has occurred in the FilterSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; @@ -152,4 +174,5 @@ int FilterSeqsCommand::execute() { exit(1); } } + /**************************************************************************************/