X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimseqscommand.cpp;h=d72ada4cff4fb44cd557eede88d63f44a43c1cd6;hb=5c80ce8b80938d41cf6c64a017fa6fd50d45de5b;hp=77da3a60f7534dca70f65d73c262c27bf9734648;hpb=0470f6d037aacb3563c3f7010708120a4a67d4e6;p=mothur.git diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 77da3a6..d72ada4 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -8,13 +8,14 @@ */ #include "trimseqscommand.h" +#include "needlemanoverlap.hpp" +#include "nast.hpp" //*************************************************************************************************************** -TrimSeqsCommand::TrimSeqsCommand(string option){ +TrimSeqsCommand::TrimSeqsCommand(string option) { try { - globaldata = GlobalData::getInstance(); abort = false; //allow user to run help @@ -22,91 +23,162 @@ TrimSeqsCommand::TrimSeqsCommand(string option){ else { //valid paramters for this command - string AlignArray[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength"}; + string AlignArray[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", + "qthreshold", "qaverage", "allfiles", "qtrim","diffs", "processors", "outputdir","inputdir"}; + vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); - parser = new OptionParser(); - parser->parse(option, parameters); delete parser; + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; - ValidParameters* validParameter = new ValidParameters(); - //check to make sure all parameters are valid for command for (it = parameters.begin(); it != parameters.end(); it++) { - if (validParameter->isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("oligos"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["oligos"] = inputDir + it->second; } + } + + it = parameters.find("qfile"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["qfile"] = inputDir + it->second; } + } } + //check for required parameters - fastafile = validParameter->validFile(parameters, "fasta", true); - if (fastafile == "not found") { cout << "fasta is a required parameter for the screen.seqs command." << endl; abort = true; } - else if (fastafile == "not open") { abort = true; } - else { globaldata->setFastaFile(fastafile); } - + fastaFile = validParameter.validFile(parameters, "fasta", true); + if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; } + else if (fastaFile == "not open") { abort = true; } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + outputDir = ""; + outputDir += hasPath(fastaFile); //if user entered a file with a path then preserve it + } //check for optional parameter and set defaults // ...at some point should added some additional type checking... string temp; - temp = validParameter->validFile(parameters, "flip", false); if (temp == "not found") { temp = "0"; } - if(isTrue(temp)) { flip = 1; } + temp = validParameter.validFile(parameters, "flip", false); + if (temp == "not found"){ flip = 0; } + else if(isTrue(temp)) { flip = 1; } - temp = validParameter->validFile(parameters, "oligos", false); if (temp == "not found") { temp = ""; } - if(temp != "") { oligos = 1; } - else { oligos = 0; } - - temp = validParameter->validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; } + temp = validParameter.validFile(parameters, "oligos", true); + if (temp == "not found"){ oligoFile = ""; } + else if(temp == "not open"){ abort = true; } + else { oligoFile = temp; } + + temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; } convert(temp, maxAmbig); - temp = validParameter->validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; } + temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; } convert(temp, maxHomoP); - temp = validParameter->validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; } + temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; } convert(temp, minLength); - temp = validParameter->validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; } - convert(temp, maxLength); + temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; } + convert(temp, maxLength); + + temp = validParameter.validFile(parameters, "diffs", false); if (temp == "not found") { temp = "0"; } + convert(temp, diffs); + + temp = validParameter.validFile(parameters, "qfile", true); + if (temp == "not found") { qFileName = ""; } + else if(temp == "not open") { abort = true; } + else { qFileName = temp; } + + temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; } + convert(temp, qThreshold); + + temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "F"; } + qtrim = isTrue(temp); + + temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; } + convert(temp, qAverage); - if(!flip && !oligos && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP ){ cout << "huh?" << endl; } + temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; } + allFiles = isTrue(temp); - delete validParameter; + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; } + convert(temp, processors); + + if(allFiles && oligoFile == ""){ + m->mothurOut("You selected allfiles, but didn't enter an oligos file. Ignoring the allfiles request."); m->mothurOutEndLine(); + } + if((qAverage != 0 && qThreshold != 0) && qFileName == ""){ + m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine(); + qAverage=0; + qThreshold=0; + } + if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){ + m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine(); + abort = true; + } } } catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the TrimSeqsCommand class Function TrimSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand"); exit(1); } - catch(...) { - cout << "An unknown error has occurred in the TrimSeqsCommand class function TrimSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; - exit(1); - } } //********************************************************************************************************************** void TrimSeqsCommand::help(){ try { - cout << "The trim.seqs command reads a fastafile and creates ....." << "\n"; - cout << "The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength and maxlength." << "\n"; - cout << "The fasta parameter is required." << "\n"; - cout << "The flip parameter .... The default is 0." << "\n"; - cout << "The oligos parameter .... The default is ""." << "\n"; - cout << "The maxambig parameter .... The default is -1." << "\n"; - cout << "The maxhomop parameter .... The default is 0." << "\n"; - cout << "The minlength parameter .... The default is 0." << "\n"; - cout << "The maxlength parameter .... The default is 0." << "\n"; - cout << "The trim.seqs command should be in the following format: " << "\n"; - cout << "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, " << "\n"; - cout << "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) " << "\n"; - cout << "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...)." << "\n"; - cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta)." << "\n" << "\n"; + m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n"); + m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n"); + m->mothurOut("The fasta parameter is required.\n"); + m->mothurOut("The flip parameter .... The default is 0.\n"); + m->mothurOut("The oligos parameter .... The default is "".\n"); + m->mothurOut("The maxambig parameter .... The default is -1.\n"); + m->mothurOut("The maxhomop parameter .... The default is 0.\n"); + m->mothurOut("The minlength parameter .... The default is 0.\n"); + m->mothurOut("The maxlength parameter .... The default is 0.\n"); + m->mothurOut("The diffs parameter .... The default is 0.\n"); + m->mothurOut("The qfile parameter .....\n"); + m->mothurOut("The qthreshold parameter .... The default is 0.\n"); + m->mothurOut("The qaverage parameter .... The default is 0.\n"); + m->mothurOut("The allfiles parameter .... The default is F.\n"); + m->mothurOut("The qtrim parameter .... The default is F.\n"); + m->mothurOut("The trim.seqs command should be in the following format: \n"); + m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n"); + m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n"); + m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"); + m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n"); } catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the TrimSeqsCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + m->errorOut(e, "TrimSeqsCommand", "help"); exit(1); } - catch(...) { - cout << "An unknown error has occurred in the TrimSeqsCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; - exit(1); - } } @@ -120,242 +192,730 @@ int TrimSeqsCommand::execute(){ try{ if (abort == true) { return 0; } - - getOligos(); - ifstream inFASTA; - openInputFile(fastafile, inFASTA); + numFPrimers = 0; //this needs to be initialized + numRPrimers = 0; + + string trimSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "trim.fasta"; + outputNames.push_back(trimSeqFile); + string scrapSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.fasta"; + outputNames.push_back(scrapSeqFile); + string groupFile = outputDir + getRootName(getSimpleName(fastaFile)) + "groups"; + + vector fastaFileNames; + if(oligoFile != ""){ + outputNames.push_back(groupFile); + getOligos(fastaFileNames); + } + + if(qFileName != "") { setLines(qFileName, qLines); } - ofstream outFASTA; - string trimSeqFile = getRootName(fastafile) + "trim.fasta"; - openOutputFile(trimSeqFile, outFASTA); - ofstream outGroups; - string groupFile = getRootName(fastafile) + "groups"; - openOutputFile(groupFile, outGroups); + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ + ifstream inFASTA; + openInputFile(fastaFile, inFASTA); + int numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + + lines.push_back(new linePair(0, numSeqs)); + + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]); + + for (int j = 0; j < fastaFileNames.size(); j++) { + rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str()); + } + + }else{ + setLines(fastaFile, lines); + if(qFileName == "") { qLines = lines; } + + createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames); + + rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str()); + rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str()); + rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str()); + for (int j = 0; j < fastaFileNames.size(); j++) { + rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str()); + } + //append files + for(int i=1;icontrol_pressed) { return 0; } + #else + ifstream inFASTA; + openInputFile(fastafileNames[s], inFASTA); + numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + + lines.push_back(new linePair(0, numSeqs)); + + driverCreateSummary(fastafile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]); + + if (m->control_pressed) { return 0; } + #endif + + + for(int i=0;i'){ + inFASTA >> seqName; + outGroups << seqName << '\t' << groupVector[i] << endl; + } + while (!inFASTA.eof()) { char c = inFASTA.get(); if (c == 10 || c == 13){ break; } } + } + outGroups.close(); + inFASTA.close(); + } + + if (m->control_pressed) { + for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + return 0; + } + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "execute"); + exit(1); + } +} + +/**************************************************************************************/ +int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector fastaNames, linePair* line, linePair* qline) { + try { + + ofstream outFASTA; + int able = openOutputFile(trimFile, outFASTA); + ofstream scrapFASTA; - string scrapSeqFile = getRootName(fastafile) + "scrap.fasta"; - openOutputFile(scrapSeqFile, scrapFASTA); - - bool success; + openOutputFile(scrapFile, scrapFASTA); - while(!inFASTA.eof()){ - Sequence currSeq(inFASTA); - string origSeq = currSeq.getUnaligned(); - string group; - string trashCode = ""; - - if(barcodes.size() != 0){ - success = stripBarcode(currSeq, group); - if(!success){ trashCode += 'b'; } - } - if(numFPrimers != 0){ - success = stripForward(currSeq); - if(!success){ trashCode += 'f'; } - } - if(numRPrimers != 0){ - success = stripReverse(currSeq); - if(!success){ trashCode += 'r'; } - } - if(minLength > 0 || maxLength > 0){ - success = cullLength(currSeq); - if(!success){ trashCode += 'l'; } - } - if(maxHomoP > 0){ - success = cullHomoP(currSeq); - if(!success){ trashCode += 'h'; } + ofstream outGroups; + vector fastaFileNames; + if (oligoFile != "") { + openOutputFile(groupFile, outGroups); + for (int i = 0; i < fastaNames.size(); i++) { + fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); } - if(maxAmbig != -1){ - success = cullAmbigs(currSeq); - if(!success){ trashCode += 'n'; } + } + + ifstream inFASTA; + openInputFile(filename, inFASTA); + + ifstream qFile; + if(qFileName != "") { openInputFile(qFileName, qFile); } + + qFile.seekg(qline->start); + inFASTA.seekg(line->start); + + for(int i=0;inum;i++){ + + if (m->control_pressed) { + inFASTA.close(); + outFASTA.close(); + scrapFASTA.close(); + if (oligoFile != "") { outGroups.close(); } + if(qFileName != "") { qFile.close(); } + for(int i=0;iclose(); + delete fastaFileNames[i]; + } + for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + return 0; } - if(flip){ currSeq.reverseComplement(); } // should go last + bool success = 1; + + Sequence currSeq(inFASTA); - if(trashCode.length() == 0){ - currSeq.printSequence(outFASTA); - outGroups << currSeq.getName() << '\t' << group << endl; - } - else{ - currSeq.setName(currSeq.getName() + '|' + trashCode); - currSeq.setUnaligned(origSeq); - currSeq.printSequence(scrapFASTA); - } + string origSeq = currSeq.getUnaligned(); + if (origSeq != "") { + int group; + string trashCode = ""; + + if(qFileName != ""){ + if(qThreshold != 0) { success = stripQualThreshold(currSeq, qFile); } + else if(qAverage != 0) { success = cullQualAverage(currSeq, qFile); } + if ((!qtrim) && (origSeq.length() != currSeq.getUnaligned().length())) { + success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap + } + if(!success) { trashCode += 'q'; } + } + if(barcodes.size() != 0){ + success = stripBarcode(currSeq, group); + if(!success){ trashCode += 'b'; } + } + + if(numFPrimers != 0){ + success = stripForward(currSeq); + if(!success){ trashCode += 'f'; } + } + + if(numRPrimers != 0){ + success = stripReverse(currSeq); + if(!success){ trashCode += 'r'; } + } + + if(minLength > 0 || maxLength > 0){ + success = cullLength(currSeq); + if(!success){ trashCode += 'l'; } + } + if(maxHomoP > 0){ + success = cullHomoP(currSeq); + if(!success){ trashCode += 'h'; } + } + if(maxAmbig != -1){ + success = cullAmbigs(currSeq); + if(!success){ trashCode += 'n'; } + } + + if(flip){ currSeq.reverseComplement(); } // should go last + + if(trashCode.length() == 0){ + currSeq.setAligned(currSeq.getUnaligned()); //this is because of a modification we made to the sequence class to fix a bug. all seqs have an aligned version, which is the version that gets printed. + currSeq.printSequence(outFASTA); + if(barcodes.size() != 0){ + outGroups << currSeq.getName() << '\t' << groupVector[group] << endl; + + if(allFiles){ + currSeq.printSequence(*fastaFileNames[group]); + } + } + } + else{ + currSeq.setName(currSeq.getName() + '|' + trashCode); + currSeq.setUnaligned(origSeq); + currSeq.printSequence(scrapFASTA); + } + } gobble(inFASTA); } + inFASTA.close(); outFASTA.close(); scrapFASTA.close(); - outGroups.close(); + if (oligoFile != "") { outGroups.close(); } + if(qFileName != "") { qFile.close(); } - return 0; + for(int i=0;iclose(); + delete fastaFileNames[i]; + } + + return 0; } catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the TrimSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim"); exit(1); } - catch(...) { - cout << "An unknown error has occurred in the TrimSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; +} +/**************************************************************************************************/ +int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector fastaNames) { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + int exitCommand = 1; + processIDS.clear(); + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, lines[process], qLines[process]); + exit(0); + }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;ierrorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim"); exit(1); } } +/**************************************************************************************************/ -//*************************************************************************************************************** - -void TrimSeqsCommand::getOligos(){ - - ifstream inOligos; - //openInputFile(globaldata->getOligosFile(), inOligos); +int TrimSeqsCommand::setLines(string filename, vector& lines) { + try { + + lines.clear(); + + vector positions; + + ifstream inFASTA; + openInputFile(filename, inFASTA); + + string input; + while(!inFASTA.eof()){ + input = getline(inFASTA); - string type, oligo, group; + if (input.length() != 0) { + if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + } + } + inFASTA.close(); + + int numFastaSeqs = positions.size(); - while(!inOligos.eof()){ - inOligos >> type; + FILE * pFile; + long size; - if(type == "forward"){ - inOligos >> oligo; - forPrimer.push_back(oligo); - } - else if(type == "reverse"){ - inOligos >> oligo; - revPrimer.push_back(oligo); + //get num bytes in file + pFile = fopen (filename.c_str(),"rb"); + if (pFile==NULL) perror ("Error opening file"); + else{ + fseek (pFile, 0, SEEK_END); + size=ftell (pFile); + fclose (pFile); } - else if(type == "barcode"){ - inOligos >> oligo >> group; - barcodes[oligo]=group; - } - else if(type[0] == '#'){ - char c; - while ((c = inOligos.get()) != EOF) { if (c == 10){ break; } } // get rest of line + + int numSeqsPerProcessor = numFastaSeqs / processors; + + for (int i = 0; i < processors; i++) { + + long int startPos = positions[ i * numSeqsPerProcessor ]; + if(i == processors - 1){ + numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; + }else{ + long int myEnd = positions[ (i+1) * numSeqsPerProcessor ]; + } + lines.push_back(new linePair(startPos, numSeqsPerProcessor)); } - gobble(inOligos); + return numFastaSeqs; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "setLines"); + exit(1); } - - numFPrimers = forPrimer.size(); - numRPrimers = revPrimer.size(); } - //*************************************************************************************************************** -bool TrimSeqsCommand::stripBarcode(Sequence& seq, string& group){ - - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent +void TrimSeqsCommand::getOligos(vector& outFASTAVec){ //vector& outFASTAVec + try { + ifstream inOligos; + openInputFile(oligoFile, inOligos); + + ofstream test; + + string type, oligo, group; + int index=0; + + while(!inOligos.eof()){ + inOligos >> type; + + if(type[0] == '#'){ + while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + } + else{ + inOligos >> oligo; + + for(int i=0;i> group; + barcodes[oligo]=index++; + groupVector.push_back(group); + + if(allFiles){ + //outFASTAVec.push_back(new ofstream((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta").c_str(), ios::ate)); + outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta")); + outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta")); + } + } + } + } + + inOligos.close(); + + numFPrimers = forPrimer.size(); + numRPrimers = revPrimer.size(); + + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "getOligos"); + exit(1); + } +} +//*************************************************************************************************************** - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; +bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ + try { + string rawSequence = seq.getUnaligned(); + bool success = 0; //guilty until proven innocent - if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = 0; - break; + //can you find the barcode + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = 0; + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(oligo.length())); + success = 1; + break; + } } - if (rawSequence.compare(0,oligo.length(),oligo) == 0){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - success = 1; - break; + //if you found the barcode or if you don't want to allow for diffs + if ((diffs == 0) || (success == 1)) { return success; } + + else { //try aligning and see if you can find it + //can you find the barcode + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = 0; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + Alignment* alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (oligo.length()+diffs+1)); + Sequence* templateSeq = new Sequence("temp", rawSequence.substr(0,(oligo.length()+diffs))); + Sequence* candidateSeq = new Sequence("temp2", oligo); + Nast nast(alignment, candidateSeq, templateSeq); + + oligo = candidateSeq->getAligned(); + cout << "barcode = " << oligo << " raw = " << rawSequence.substr(0,(oligo.length())) << endl; + delete alignment; + delete templateSeq; + delete candidateSeq; + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(0,oligo.length())); + success = 1; + break; + } + } } + return success; + } - return success; - + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "stripBarcode"); + exit(1); + } + } //*************************************************************************************************************** bool TrimSeqsCommand::stripForward(Sequence& seq){ - - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(int i=0;igetAligned(); + + delete alignment; + delete templateSeq; + delete candidateSeq; + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + seq.setUnaligned(rawSequence.substr(0,oligo.length())); + success = 1; + break; + } + } } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "stripForward"); + exit(1); } - - return success; - } //*************************************************************************************************************** bool TrimSeqsCommand::stripReverse(Sequence& seq){ - - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(int i=0;ierrorOut(e, "TrimSeqsCommand", "stripReverse"); + exit(1); + } } //*************************************************************************************************************** bool TrimSeqsCommand::cullLength(Sequence& seq){ + try { - int length = seq.getNumBases(); - bool success = 0; //guilty until proven innocent - - if(length >= minLength && maxLength == 0) { success = 1; } - else if(length >= minLength && length <= maxLength) { success = 1; } - else { success = 0; } + int length = seq.getNumBases(); + bool success = 0; //guilty until proven innocent + + if(length >= minLength && maxLength == 0) { success = 1; } + else if(length >= minLength && length <= maxLength) { success = 1; } + else { success = 0; } + + return success; - return success; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "cullLength"); + exit(1); + } } //*************************************************************************************************************** bool TrimSeqsCommand::cullHomoP(Sequence& seq){ - - int longHomoP = seq.getLongHomoPolymer(); - bool success = 0; //guilty until proven innocent - - if(longHomoP <= maxHomoP){ success = 1; } - else { success = 0; } - - return success; + try { + int longHomoP = seq.getLongHomoPolymer(); + bool success = 0; //guilty until proven innocent + + if(longHomoP <= maxHomoP){ success = 1; } + else { success = 0; } + + return success; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "cullHomoP"); + exit(1); + } } //*************************************************************************************************************** bool TrimSeqsCommand::cullAmbigs(Sequence& seq){ + try { + int numNs = seq.getAmbigBases(); + bool success = 0; //guilty until proven innocent + + if(numNs <= maxAmbig) { success = 1; } + else { success = 0; } + + return success; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "cullAmbigs"); + exit(1); + } - int numNs = seq.getAmbigBases(); - bool success = 0; //guilty until proven innocent - - if(numNs <= maxAmbig){ success = 1; } - else { success = 0; } +} + +//*************************************************************************************************************** + +bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){ + try { + bool success = 1; + int length = oligo.length(); + + for(int i=0;ierrorOut(e, "TrimSeqsCommand", "compareDNASeq"); + exit(1); + } + +} + +//*************************************************************************************************************** + +bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){ + try { + string rawSequence = seq.getUnaligned(); + int seqLength; // = rawSequence.length(); + string name, temp, temp2; + + qFile >> name >> temp; - return success; + splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242 + convert(temp, seqLength); //converts string to int + if (name.length() != 0) { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); } } + while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } + + int score; + int end = seqLength; + + for(int i=0;i> score; + + if(score <= qThreshold){ + end = i; + break; + } + } + for(int i=end+1;i> score; + } + + seq.setUnaligned(rawSequence.substr(0,end)); + + return 1; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold"); + exit(1); + } +} + +//*************************************************************************************************************** + +bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){ + try { + string rawSequence = seq.getUnaligned(); + int seqLength = seq.getNumBases(); + bool success = 0; //guilty until proven innocent + string name; + + qFile >> name; + if (name[0] == '>') { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); } } + + while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } + + float score; + float average = 0; + + for(int i=0;i> score; + average += score; + } + average /= seqLength; + + if(average >= qAverage) { success = 1; } + else { success = 0; } + + return success; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "cullQualAverage"); + exit(1); + } } //***************************************************************************************************************