X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimseqscommand.cpp;h=5e0a6471010f6d66dafa82037adac1fdb5ab518a;hb=c4eb056b457b3832c42fce61d1843b54ca20aa3c;hp=ca7a927134f1f904401ab017d456ff1f6859a569;hpb=648ec37228eb16075ace911dd5a5773cdfe683da;p=mothur.git diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index ca7a927..5e0a647 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -8,20 +8,23 @@ */ #include "trimseqscommand.h" +#include "needlemanoverlap.hpp" //*************************************************************************************************************** -TrimSeqsCommand::TrimSeqsCommand(string option){ +TrimSeqsCommand::TrimSeqsCommand(string option) { try { abort = false; + comboStarts = 0; //allow user to run help if(option == "help") { help(); abort = true; } else { //valid paramters for this command - string AlignArray[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", "qthreshold", "qaverage", "allfiles", "qtrim"}; + string AlignArray[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", + "qthreshold", "qaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); @@ -29,17 +32,54 @@ TrimSeqsCommand::TrimSeqsCommand(string option){ map parameters = parser.getParameters(); ValidParameters validParameter; + map::iterator it; //check to make sure all parameters are valid for command - for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + for (it = parameters.begin(); it != parameters.end(); it++) { if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } } + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("oligos"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["oligos"] = inputDir + it->second; } + } + + it = parameters.find("qfile"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["qfile"] = inputDir + it->second; } + } + } + + //check for required parameters fastaFile = validParameter.validFile(parameters, "fasta", true); - if (fastaFile == "not found") { mothurOut("fasta is a required parameter for the screen.seqs command."); mothurOutEndLine(); abort = true; } + if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; } else if (fastaFile == "not open") { abort = true; } - + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + outputDir = ""; + outputDir += hasPath(fastaFile); //if user entered a file with a path then preserve it + } //check for optional parameter and set defaults // ...at some point should added some additional type checking... @@ -65,9 +105,21 @@ TrimSeqsCommand::TrimSeqsCommand(string option){ temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; } convert(temp, maxLength); + + temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; } + convert(temp, bdiffs); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } + convert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); } + convert(temp, tdiffs); + + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; } + temp = validParameter.validFile(parameters, "qfile", true); if (temp == "not found") { qFileName = ""; } - else if(temp == "not open") { abort = 0; } + else if(temp == "not open") { abort = true; } else { qFileName = temp; } temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; } @@ -82,23 +134,26 @@ TrimSeqsCommand::TrimSeqsCommand(string option){ temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; } allFiles = isTrue(temp); + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; } + convert(temp, processors); + if(allFiles && oligoFile == ""){ - mothurOut("You selected allfiles, but didn't enter an oligos file. Ignoring the allfiles request."); mothurOutEndLine(); + m->mothurOut("You selected allfiles, but didn't enter an oligos file. Ignoring the allfiles request."); m->mothurOutEndLine(); } if((qAverage != 0 && qThreshold != 0) && qFileName == ""){ - mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); mothurOutEndLine(); + m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine(); qAverage=0; qThreshold=0; } if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){ - mothurOut("You didn't set any options... quiting command."); mothurOutEndLine(); + m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine(); abort = true; } } } catch(exception& e) { - errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand"); + m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand"); exit(1); } } @@ -106,30 +161,33 @@ TrimSeqsCommand::TrimSeqsCommand(string option){ void TrimSeqsCommand::help(){ try { - mothurOut("The trim.seqs command reads a fastaFile and creates .....\n"); - mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, qtrim and allfiles.\n"); - mothurOut("The fasta parameter is required.\n"); - mothurOut("The flip parameter .... The default is 0.\n"); - mothurOut("The oligos parameter .... The default is "".\n"); - mothurOut("The maxambig parameter .... The default is -1.\n"); - mothurOut("The maxhomop parameter .... The default is 0.\n"); - mothurOut("The minlength parameter .... The default is 0.\n"); - mothurOut("The maxlength parameter .... The default is 0.\n"); - mothurOut("The qfile parameter .....\n"); - mothurOut("The qthreshold parameter .... The default is 0.\n"); - mothurOut("The qaverage parameter .... The default is 0.\n"); - mothurOut("The allfiles parameter .... The default is F.\n"); - mothurOut("The qtrim parameter .... The default is F.\n"); - mothurOut("The trim.seqs command should be in the following format: \n"); - mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n"); - mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n"); - mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n"); - mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"); - mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n"); + m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n"); + m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n"); + m->mothurOut("The fasta parameter is required.\n"); + m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n"); + m->mothurOut("The oligos parameter .... The default is "".\n"); + m->mothurOut("The maxambig parameter .... The default is -1.\n"); + m->mothurOut("The maxhomop parameter .... The default is 0.\n"); + m->mothurOut("The minlength parameter .... The default is 0.\n"); + m->mothurOut("The maxlength parameter .... The default is 0.\n"); + m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n"); + m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"); + m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"); + m->mothurOut("The qfile parameter .....\n"); + m->mothurOut("The qthreshold parameter .... The default is 0.\n"); + m->mothurOut("The qaverage parameter .... The default is 0.\n"); + m->mothurOut("The allfiles parameter .... The default is F.\n"); + m->mothurOut("The qtrim parameter .... The default is F.\n"); + m->mothurOut("The trim.seqs command should be in the following format: \n"); + m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n"); + m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n"); + m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"); + m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n"); } catch(exception& e) { - errorOut(e, "TrimSeqsCommand", "help"); + m->errorOut(e, "TrimSeqsCommand", "help"); exit(1); } } @@ -145,62 +203,220 @@ int TrimSeqsCommand::execute(){ try{ if (abort == true) { return 0; } + + numFPrimers = 0; //this needs to be initialized + numRPrimers = 0; + + string trimSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "trim.fasta"; + outputNames.push_back(trimSeqFile); + string scrapSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.fasta"; + outputNames.push_back(scrapSeqFile); + string groupFile = outputDir + getRootName(getSimpleName(fastaFile)) + "groups"; + + vector fastaFileNames; + if(oligoFile != ""){ + outputNames.push_back(groupFile); + getOligos(fastaFileNames); + } + + if(qFileName != "") { setLines(qFileName, qLines); } - ifstream inFASTA; - openInputFile(fastaFile, inFASTA); + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ + ifstream inFASTA; + int numSeqs; + openInputFile(fastaFile, inFASTA); + getNumSeqs(inFASTA, numSeqs); + inFASTA.close(); + + lines.push_back(new linePair(0, numSeqs)); + + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]); + + for (int j = 0; j < fastaFileNames.size(); j++) { + rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str()); + } + + }else{ + setLines(fastaFile, lines); + if(qFileName == "") { qLines = lines; } + + createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames); + + rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str()); + rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str()); + rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str()); + for (int j = 0; j < fastaFileNames.size(); j++) { + rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str()); + } + //append files + for(int i=1;icontrol_pressed) { return 0; } + #else + ifstream inFASTA; + int numSeqs; + openInputFile(fastaFile, inFASTA); + getNumSeqs(inFASTA, numSeqs); + inFASTA.close(); + + lines.push_back(new linePair(0, numSeqs)); + + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]); + + if (m->control_pressed) { return 0; } + #endif + + + for(int i=0;i comboStarts) { + map::iterator itCombo; + for(itCombo=combos.begin();itCombo!=combos.end(); itCombo++){ + if(itCombo->second == i){ thisGroup = itCombo->first; combos.erase(itCombo); break; } + } + }else{ thisGroup = groupVector[i]; } + + while(!inFASTA.eof()){ + if(inFASTA.get() == '>'){ + inFASTA >> seqName; + outGroups << seqName << '\t' << thisGroup << endl; + } + while (!inFASTA.eof()) { char c = inFASTA.get(); if (c == 10 || c == 13){ break; } } + } + outGroups.close(); + inFASTA.close(); + } + } + + if (m->control_pressed) { + for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + return 0; + } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "execute"); + exit(1); + } +} + +/**************************************************************************************/ +int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector fastaNames, linePair* line, linePair* qline) { + try { ofstream outFASTA; - string trimSeqFile = getRootName(fastaFile) + "trim.fasta"; - openOutputFile(trimSeqFile, outFASTA); + int able = openOutputFile(trimFile, outFASTA); + + ofstream scrapFASTA; + openOutputFile(scrapFile, scrapFASTA); ofstream outGroups; vector fastaFileNames; - if(oligoFile != ""){ - string groupFile = getRootName(fastaFile) + "groups"; - openOutputFile(groupFile, outGroups); - getOligos(fastaFileNames); + if (oligoFile != "") { + openOutputFile(groupFile, outGroups); + for (int i = 0; i < fastaNames.size(); i++) { + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); + #else + fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(i) + ".temp").c_str(), ios::ate)); + #endif + } } - ofstream scrapFASTA; - string scrapSeqFile = getRootName(fastaFile) + "scrap.fasta"; - openOutputFile(scrapSeqFile, scrapFASTA); + ifstream inFASTA; + openInputFile(filename, inFASTA); ifstream qFile; if(qFileName != "") { openInputFile(qFileName, qFile); } - bool success; + qFile.seekg(qline->start); + inFASTA.seekg(line->start); + + for(int i=0;inum;i++){ + + if (m->control_pressed) { + inFASTA.close(); outFASTA.close(); scrapFASTA.close(); if (oligoFile != "") { outGroups.close(); } if(qFileName != "") { qFile.close(); } + for(int i=0;iclose(); delete fastaFileNames[i]; } + for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + return 0; + } + + int success = 1; - while(!inFASTA.eof()){ Sequence currSeq(inFASTA); + cout << i << '\t' << currSeq.getName() << endl; + string origSeq = currSeq.getUnaligned(); if (origSeq != "") { - int group; + int groupBar, groupPrime; string trashCode = ""; + int currentSeqsDiffs = 0; if(qFileName != ""){ if(qThreshold != 0) { success = stripQualThreshold(currSeq, qFile); } else if(qAverage != 0) { success = cullQualAverage(currSeq, qFile); } - if ((!qtrim) && (origSeq.length() != currSeq.getUnaligned().length())) { + + if (qtrim == 1 && (origSeq.length() != currSeq.getUnaligned().length())) { success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap } + if(!success) { trashCode += 'q'; } } + if(barcodes.size() != 0){ - - success = stripBarcode(currSeq, group); - if(!success){ trashCode += 'b'; } + success = stripBarcode(currSeq, groupBar); + if(success > bdiffs){ trashCode += 'b'; } + else{ currentSeqsDiffs += success; } } + if(numFPrimers != 0){ - success = stripForward(currSeq); - if(!success){ trashCode += 'f'; } + success = stripForward(currSeq, groupPrime); + if(success > pdiffs){ trashCode += 'f'; } + else{ currentSeqsDiffs += success; } } + + if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } + if(numRPrimers != 0){ success = stripReverse(currSeq); if(!success){ trashCode += 'r'; } } + if(minLength > 0 || maxLength > 0){ success = cullLength(currSeq); - if ((currSeq.getUnaligned().length() > 300) && (success)) { cout << "too long " << currSeq.getUnaligned().length() << endl; } if(!success){ trashCode += 'l'; } } if(maxHomoP > 0){ @@ -215,28 +431,39 @@ int TrimSeqsCommand::execute(){ if(flip){ currSeq.reverseComplement(); } // should go last if(trashCode.length() == 0){ - currSeq.setAligned(currSeq.getUnaligned()); //this is because of a modification we made to the sequence class to fix a bug. all seqs have an aligned version, which is the version that gets printed. + currSeq.setAligned(currSeq.getUnaligned()); currSeq.printSequence(outFASTA); if(barcodes.size() != 0){ - outGroups << currSeq.getName() << '\t' << groupVector[group] << endl; + string thisGroup = groupVector[groupBar]; + int indexToFastaFile = groupBar; + if (primers.size() != 0){ + //does this primer have a group + if (groupVector[groupPrime] != "") { + thisGroup += "." + groupVector[groupPrime]; + indexToFastaFile = combos[thisGroup]; + } + } + outGroups << currSeq.getName() << '\t' << thisGroup << endl; if(allFiles){ - currSeq.printSequence(*fastaFileNames[group]); + currSeq.printSequence(*fastaFileNames[indexToFastaFile]); } } } else{ currSeq.setName(currSeq.getName() + '|' + trashCode); currSeq.setUnaligned(origSeq); + currSeq.setAligned(origSeq); currSeq.printSequence(scrapFASTA); } } gobble(inFASTA); } + inFASTA.close(); outFASTA.close(); scrapFASTA.close(); - outGroups.close(); + if (oligoFile != "") { outGroups.close(); } if(qFileName != "") { qFile.close(); } for(int i=0;i'){ - inFASTA >> seqName; - outGroups << seqName << '\t' << groupVector[i] << endl; - } - while (!inFASTA.eof()) { char c = inFASTA.get(); if (c == 10 || c == 13){ break; } } - } - outGroups.close(); - inFASTA.close(); + return 0; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim"); + exit(1); + } +} +/**************************************************************************************************/ +int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string groupFile, vector fastaNames) { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + int exitCommand = 1; + processIDS.clear(); + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, lines[process], qLines[process]); + exit(0); + }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } } + //force parent to wait until all the processes are done + for (int i=0;ierrorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim"); exit(1); } } +/**************************************************************************************************/ + +int TrimSeqsCommand::setLines(string filename, vector& lines) { + try { + + lines.clear(); + + vector positions; + + ifstream inFASTA; + openInputFile(filename, inFASTA); + + string input; + while(!inFASTA.eof()){ + input = getline(inFASTA); + + if (input.length() != 0) { + if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + } + } + inFASTA.close(); + + int numFastaSeqs = positions.size(); + + FILE * pFile; + long size; + + //get num bytes in file + pFile = fopen (filename.c_str(),"rb"); + if (pFile==NULL) perror ("Error opening file"); + else{ + fseek (pFile, 0, SEEK_END); + size=ftell (pFile); + fclose (pFile); + } + + int numSeqsPerProcessor = numFastaSeqs / processors; + + for (int i = 0; i < processors; i++) { + long int startPos = positions[ i * numSeqsPerProcessor ]; + if(i == processors - 1){ + numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; + }else{ + long int myEnd = positions[ (i+1) * numSeqsPerProcessor ]; + } + lines.push_back(new linePair(startPos, numSeqsPerProcessor)); + } + + return numFastaSeqs; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "setLines"); + exit(1); + } +} //*************************************************************************************************************** -void TrimSeqsCommand::getOligos(vector& outFASTAVec){ +void TrimSeqsCommand::getOligos(vector& outFASTAVec){ //vector& outFASTAVec try { ifstream inOligos; openInputFile(oligoFile, inOligos); @@ -281,14 +580,18 @@ void TrimSeqsCommand::getOligos(vector& outFASTAVec){ string type, oligo, group; int index=0; + //int indexPrimer = 0; while(!inOligos.eof()){ inOligos >> type; - + if(type[0] == '#'){ while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there } else{ + //make type case insensitive + for(int i=0;i> oligo; for(int i=0;i& outFASTAVec){ if(oligo[i] == 'U') { oligo[i] = 'T'; } } - if(type == "forward"){ - forPrimer.push_back(oligo); + if(type == "FORWARD"){ + group = ""; + + // get rest of line in case there is a primer name + while (!inOligos.eof()) { + char c = inOligos.get(); + if (c == 10 || c == 13){ break; } + else if (c == 32 || c == 9){;} //space or tab + else { group += c; } + } + + //check for repeat barcodes + map::iterator itPrime = primers.find(oligo); + if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + primers[oligo]=index; index++; + groupVector.push_back(group); + + if(allFiles){ + if (group != "") { //there is a group for this primer + outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + toString(index) + "." + group + ".fasta")); + outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + toString(index) + "." + group + ".fasta")); + } + } + } - else if(type == "reverse"){ - revPrimer.push_back(oligo); + else if(type == "REVERSE"){ + Sequence oligoRC("reverse", oligo); + oligoRC.reverseComplement(); + revPrimer.push_back(oligoRC.getUnaligned()); } - else if(type == "barcode"){ + else if(type == "BARCODE"){ inOligos >> group; - barcodes[oligo]=index++; + + //check for repeat barcodes + map::iterator itBar = barcodes.find(oligo); + if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + barcodes[oligo]=index; index++; groupVector.push_back(group); if(allFiles){ - outFASTAVec.push_back(new ofstream((getRootName(fastaFile) + group + ".fasta").c_str(), ios::ate)); + outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + toString(index) + "." + group + ".fasta")); + outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + toString(index) + "." + group + ".fasta")); } - } + }else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } } + gobble(inOligos); } inOligos.close(); - numFPrimers = forPrimer.size(); + //add in potential combos + if(allFiles){ + comboStarts = outFASTAVec.size()-1; + for (map::iterator itBar = barcodes.begin(); itBar != barcodes.end(); itBar++) { + for (map::iterator itPrime = primers.begin(); itPrime != primers.end(); itPrime++) { + if (groupVector[itPrime->second] != "") { //there is a group for this primer + outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + toString(itBar->second) + "." + groupVector[itBar->second] + "." + toString(itPrime->second) + "." + groupVector[itPrime->second] + ".fasta")); + outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + toString(itBar->second) + "." + groupVector[itBar->second] + "." + toString(itPrime->second) + "." + groupVector[itPrime->second] + ".fasta")); + combos[(groupVector[itBar->second] + "." + groupVector[itPrime->second])] = outFASTAVec.size()-1; + } + } + } + } + + numFPrimers = primers.size(); numRPrimers = revPrimer.size(); } catch(exception& e) { - errorOut(e, "TrimSeqsCommand", "getOligos"); + m->errorOut(e, "TrimSeqsCommand", "getOligos"); exit(1); } - } - //*************************************************************************************************************** -bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ +int TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ try { + string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent + int success = bdiffs + 1; //guilty until proven innocent + //can you find the barcode for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ string oligo = it->first; if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = 0; - break; + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; } if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ group = it->second; seq.setUnaligned(rawSequence.substr(oligo.length())); - success = 1; + success = 0; break; } } + + //if you found the barcode or if you don't want to allow for diffs +// cout << success; + if ((bdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it +// cout << endl; + + int maxLength = 0; + + Alignment* alignment; + if (barcodes.size() > 0) { + map::iterator it=barcodes.begin(); + + for(it;it!=barcodes.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; +// int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int newStart=0; + int numDiff = countDiffs(oligo, temp); + +// cout << oligo << '\t' << temp << '\t' << numDiff << endl; + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } +// cout << success << endl; + return success; } catch(exception& e) { - errorOut(e, "TrimSeqsCommand", "stripBarcode"); + m->errorOut(e, "TrimSeqsCommand", "stripBarcode"); exit(1); } @@ -360,31 +795,116 @@ bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ //*************************************************************************************************************** -bool TrimSeqsCommand::stripForward(Sequence& seq){ +int TrimSeqsCommand::stripForward(Sequence& seq, int& group){ try { string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent + int success = pdiffs + 1; //guilty until proven innocent - for(int i=0;i::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; } if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; seq.setUnaligned(rawSequence.substr(oligo.length())); - success = 1; + success = 0; break; } } + + //if you found the barcode or if you don't want to allow for diffs +// cout << success; + if ((pdiffs == 0) || (success == 0)) { return success; } - return success; + else { //try aligning and see if you can find it +// cout << endl; + + int maxLength = 0; + + Alignment* alignment; + if (primers.size() > 0) { + map::iterator it=primers.begin(); + + for(it;it!=primers.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; +// int length = oligo.length(); + + if(rawSequence.length() < maxLength){ + success = pdiffs + 100; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int newStart=0; + int numDiff = countDiffs(oligo, temp); + +// cout << oligo << '\t' << temp << '\t' << numDiff << endl; + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i pdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + return success; + } catch(exception& e) { - errorOut(e, "TrimSeqsCommand", "stripForward"); + m->errorOut(e, "TrimSeqsCommand", "stripForward"); exit(1); } } @@ -405,7 +925,7 @@ bool TrimSeqsCommand::stripReverse(Sequence& seq){ } if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){ - seq.setUnaligned(rawSequence.substr(rawSequence.length()-oligo.length())); + seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length())); success = 1; break; } @@ -414,7 +934,7 @@ bool TrimSeqsCommand::stripReverse(Sequence& seq){ } catch(exception& e) { - errorOut(e, "TrimSeqsCommand", "stripReverse"); + m->errorOut(e, "TrimSeqsCommand", "stripReverse"); exit(1); } } @@ -435,7 +955,7 @@ bool TrimSeqsCommand::cullLength(Sequence& seq){ } catch(exception& e) { - errorOut(e, "TrimSeqsCommand", "cullLength"); + m->errorOut(e, "TrimSeqsCommand", "cullLength"); exit(1); } @@ -454,7 +974,7 @@ bool TrimSeqsCommand::cullHomoP(Sequence& seq){ return success; } catch(exception& e) { - errorOut(e, "TrimSeqsCommand", "cullHomoP"); + m->errorOut(e, "TrimSeqsCommand", "cullHomoP"); exit(1); } @@ -473,7 +993,7 @@ bool TrimSeqsCommand::cullAmbigs(Sequence& seq){ return success; } catch(exception& e) { - errorOut(e, "TrimSeqsCommand", "cullAmbigs"); + m->errorOut(e, "TrimSeqsCommand", "cullAmbigs"); exit(1); } @@ -489,7 +1009,7 @@ bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){ for(int i=0;ierrorOut(e, "TrimSeqsCommand", "compareDNASeq"); exit(1); } } +//*************************************************************************************************************** + +int TrimSeqsCommand::countDiffs(string oligo, string seq){ + try { + + int length = oligo.length(); + int countDiffs = 0; + + for(int i=0;ierrorOut(e, "TrimSeqsCommand", "countDiffs"); + exit(1); + } +} //*************************************************************************************************************** bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){ try { +// string rawSequence = seq.getUnaligned(); +// int seqLength; // = rawSequence.length(); +// string name, temp, temp2; +// +// qFile >> name; +// +// //get rest of line +// temp = ""; +// while (!qFile.eof()) { +// char c = qFile.get(); +// if (c == 10 || c == 13){ break; } +// else { temp += c; } +// } +// +// int pos = temp.find("length"); +// if (pos == temp.npos) { m->mothurOut("Cannot find length in qfile for " + seq.getName()); m->mothurOutEndLine(); seqLength = 0; } +// else { +// string tempLength = temp.substr(pos); +// istringstream iss (tempLength,istringstream::in); +// iss >> temp; +// } +// +// splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242 +// convert(temp, seqLength); //converts string to int +// +// if (name.length() != 0) { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); } } + string rawSequence = seq.getUnaligned(); - int seqLength = rawSequence.length(); + int seqLength = seq.getNumBases(); + bool success = 0; //guilty until proven innocent string name; qFile >> name; - if (name.length() != 0) { if(name.substr(1) != seq.getName()) { mothurOut("sequence name mismatch btwn fasta and qual file"); mothurOutEndLine(); } } + if (name[0] == '>') { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); } } + while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } int score; @@ -536,7 +1119,7 @@ bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){ for(int i=0;i> score; - if(score <= qThreshold){ + if(score < qThreshold){ end = i; break; } @@ -550,7 +1133,7 @@ bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){ return 1; } catch(exception& e) { - errorOut(e, "TrimSeqsCommand", "stripQualThreshold"); + m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold"); exit(1); } } @@ -565,7 +1148,7 @@ bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){ string name; qFile >> name; - if (name[0] == '>') { if(name.substr(1) != seq.getName()) { mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); mothurOutEndLine(); } } + if (name[0] == '>') { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); } } while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } @@ -584,7 +1167,7 @@ bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){ return success; } catch(exception& e) { - errorOut(e, "TrimSeqsCommand", "cullQualAverage"); + m->errorOut(e, "TrimSeqsCommand", "cullQualAverage"); exit(1); } }