X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimseqscommand.h;h=780b4c7381cad9732d41b6fae8fc41736f6083e0;hb=43ed0accfbc2852849e104ff7eccdd2c42acd4ec;hp=e4c2e0b6748eb6c795c8a171e1f1e8a6fb380f6f;hpb=d635b39347cd81943ea50de7b813a0a5d743b0c0;p=mothur.git diff --git a/trimseqscommand.h b/trimseqscommand.h index e4c2e0b..780b4c7 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -15,49 +15,48 @@ #include "sequence.hpp" #include "qualityscores.h" #include "groupmap.h" +#include "trimoligos.h" + class TrimSeqsCommand : public Command { public: TrimSeqsCommand(string); TrimSeqsCommand(); - ~TrimSeqsCommand(); - vector getRequiredParameters(); - vector getValidParameters(); - vector getRequiredFiles(); - map > getOutputFiles() { return outputTypes; } - int execute(); - void help(); + ~TrimSeqsCommand(){} + + vector setParameters(); + string getCommandName() { return "trim.seqs"; } + string getCommandCategory() { return "Sequence Processing"; } + string getHelpString(); + string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; } + string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; } + + int execute(); + void help() { m->mothurOut(getHelpString()); } private: GroupMap* groupMap; - - struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} - }; - - void getOligos(vector&, vector&); - int stripBarcode(Sequence&, QualityScores&, int&); - int stripForward(Sequence&, QualityScores&, int&); - bool stripReverse(Sequence&, QualityScores&); - + + struct linePair { + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} + linePair() {} + }; + + bool getOligos(vector >&, vector >&, vector >&); bool keepFirstTrim(Sequence&, QualityScores&); bool removeLastTrim(Sequence&, QualityScores&); - bool cullLength(Sequence&); bool cullHomoP(Sequence&); bool cullAmbigs(Sequence&); - bool compareDNASeq(string, string); - int countDiffs(string, string); - map > outputTypes; - bool abort; - string fastaFile, oligoFile, qFileName, groupfile, outputDir; + bool abort, createGroup; + string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir; - bool flip, allFiles, qtrim; - int numFPrimers, numRPrimers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, comboStarts; + bool flip, allFiles, qtrim, keepforward; + int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts; int qWindowSize, qWindowStep, keepFirst, removeLast; double qRollAverage, qThreshold, qWindowAverage, qAverage; vector revPrimer, outputNames; @@ -65,16 +64,428 @@ private: map barcodes; vector groupVector; map primers; + vector linker; + vector spacer; map combos; map groupToIndex; - + vector primerNameVector; //needed here? + vector barcodeNameVector; //needed here? + map groupCounts; + map nameMap; + vector processIDS; //processid - vector lines; - vector qLines; + vector lines; + vector qLines; - int driverCreateTrim(string, string, string, string, string, string, string, vector, vector, linePair*, linePair*); - int createProcessesCreateTrim(string, string, string, string, string, string, string, vector, vector); - int setLines(string, string, vector&, vector&); + int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector >, vector >, vector >, linePair, linePair); + int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector >, vector >, vector >); + int setLines(string, string); }; +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct trimData { + unsigned long long start, end; + MothurOut* m; + string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile; + vector > fastaFileNames; + vector > qualFileNames; + vector > nameFileNames; + unsigned long long lineStart, lineEnd, qlineStart, qlineEnd; + bool flip, allFiles, qtrim, keepforward, createGroup; + int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs; + int qWindowSize, qWindowStep, keepFirst, removeLast, count; + double qRollAverage, qThreshold, qWindowAverage, qAverage; + vector revPrimer; + map barcodes; + map primers; + vector linker; + vector spacer; + map combos; + vector primerNameVector; + vector barcodeNameVector; + map groupCounts; + map nameMap; + + trimData(){} + trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector > ffn, vector > qfn, vector > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout, + int pd, int bd, int ld, int sd, int td, map pri, map bar, vector revP, vector li, vector spa, + vector priNameVector, vector barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL, + int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage, + int minL, int maxA, int maxH, int maxL, bool fli, map nm) { + filename = fn; + qFileName = qn; + nameFile = nf; + trimFileName = tn; + scrapFileName = sn; + trimQFileName = tqn; + scrapQFileName = sqn; + trimNFileName = tnn; + scrapNFileName = snn; + groupFileName = gn; + fastaFileNames = ffn; + qualFileNames = qfn; + nameFileNames = nfn; + lineStart = lstart; + lineEnd = lend; + qlineStart = qstart; + qlineEnd = qend; + m = mout; + + pdiffs = pd; + bdiffs = bd; + ldiffs = ld; + sdiffs = sd; + tdiffs = td; + barcodes = bar; + primers = pri; numFPrimers = primers.size(); + revPrimer = revP; numRPrimers = revPrimer.size(); + linker = li; numLinkers = linker.size(); + spacer = spa; numSpacers = spacer.size(); + primerNameVector = priNameVector; + barcodeNameVector = barNameVector; + + createGroup = cGroup; + allFiles = aFiles; + keepforward = keepF; + keepFirst = keepfi; + removeLast = removeL; + qWindowStep = WindowStep; + qWindowSize = WindowSize; + qWindowAverage = WindowAverage; + qtrim = trim; + qThreshold = Threshold; + qAverage = Average; + qRollAverage = RollAverage; + minLength = minL; + maxAmbig = maxA; + maxHomoP = maxH; + maxLength = maxL; + flip = fli; + nameMap = nm; + count = 0; + } +}; +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#else +static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ + trimData* pDataArray; + pDataArray = (trimData*)lpParam; + + try { + ofstream trimFASTAFile; + pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile); + + ofstream scrapFASTAFile; + pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile); + + ofstream trimQualFile; + ofstream scrapQualFile; + if(pDataArray->qFileName != ""){ + pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile); + pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile); + } + + ofstream trimNameFile; + ofstream scrapNameFile; + if(pDataArray->nameFile != ""){ + pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile); + pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile); + } + + + ofstream outGroupsFile; + if (pDataArray->createGroup){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); } + if(pDataArray->allFiles){ + for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file + for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file + if (pDataArray->fastaFileNames[i][j] != "") { + ofstream temp; + pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close(); + if(pDataArray->qFileName != ""){ + pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close(); + } + + if(pDataArray->nameFile != ""){ + pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close(); + } + } + } + } + } + + ifstream inFASTA; + pDataArray->m->openInputFile(pDataArray->filename, inFASTA); + if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { + inFASTA.seekg(0); + }else { //this accounts for the difference in line endings. + inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA); + } + + ifstream qFile; + if(pDataArray->qFileName != "") { + pDataArray->m->openInputFile(pDataArray->qFileName, qFile); + if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) { + qFile.seekg(0); + }else { //this accounts for the difference in line endings. + qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile); + } + } + + + TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer); + + pDataArray->count = pDataArray->lineEnd; + for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process + + if (pDataArray->m->control_pressed) { + inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close(); + if (pDataArray->createGroup) { outGroupsFile.close(); } + if(pDataArray->qFileName != ""){ qFile.close(); } + return 0; + } + + int success = 1; + string trashCode = ""; + int currentSeqsDiffs = 0; + + Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA); + + QualityScores currQual; + if(pDataArray->qFileName != ""){ + currQual = QualityScores(qFile); pDataArray->m->gobble(qFile); + } + + string origSeq = currSeq.getUnaligned(); + if (origSeq != "") { + + int barcodeIndex = 0; + int primerIndex = 0; + + if(pDataArray->numLinkers != 0){ + success = trimOligos.stripLinker(currSeq, currQual); + if(success > pDataArray->ldiffs) { trashCode += 'k'; } + else{ currentSeqsDiffs += success; } + } + + if(pDataArray->barcodes.size() != 0){ + success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex); + if(success > pDataArray->bdiffs) { trashCode += 'b'; } + else{ currentSeqsDiffs += success; } + } + + if(pDataArray->numSpacers != 0){ + success = trimOligos.stripSpacer(currSeq, currQual); + if(success > pDataArray->sdiffs) { trashCode += 's'; } + else{ currentSeqsDiffs += success; } + + } + + if(pDataArray->numFPrimers != 0){ + success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward); + if(success > pDataArray->pdiffs) { trashCode += 'f'; } + else{ currentSeqsDiffs += success; } + } + + if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; } + + if(pDataArray->numRPrimers != 0){ + success = trimOligos.stripReverse(currSeq, currQual); + if(!success) { trashCode += 'r'; } + } + + if(pDataArray->keepFirst != 0){ + //success = keepFirstTrim(currSeq, currQual); + success = 1; + if(currQual.getName() != ""){ + currQual.trimQScores(-1, pDataArray->keepFirst); + } + currSeq.trim(pDataArray->keepFirst); + } + + if(pDataArray->removeLast != 0){ + //success = removeLastTrim(currSeq, currQual); + success = 0; + int length = currSeq.getNumBases() - pDataArray->removeLast; + + if(length > 0){ + if(currQual.getName() != ""){ + currQual.trimQScores(-1, length); + } + currSeq.trim(length); + success = 1; + } + else{ success = 0; } + + if(!success) { trashCode += 'l'; } + } + + + if(pDataArray->qFileName != ""){ + int origLength = currSeq.getNumBases(); + + if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); } + else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); } + else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); } + else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); } + else { success = 1; } + + //you don't want to trim, if it fails above then scrap it + if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; } + + if(!success) { trashCode += 'q'; } + } + + if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){ + //success = cullLength(currSeq); + int length = currSeq.getNumBases(); + success = 0; //guilty until proven innocent + if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; } + else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; } + else { success = 0; } + + if(!success) { trashCode += 'l'; } + } + if(pDataArray->maxHomoP > 0){ + //success = cullHomoP(currSeq); + int longHomoP = currSeq.getLongHomoPolymer(); + success = 0; //guilty until proven innocent + if(longHomoP <= pDataArray->maxHomoP){ success = 1; } + else { success = 0; } + + if(!success) { trashCode += 'h'; } + } + if(pDataArray->maxAmbig != -1){ + //success = cullAmbigs(currSeq); + int numNs = currSeq.getAmbigBases(); + success = 0; //guilty until proven innocent + if(numNs <= pDataArray->maxAmbig) { success = 1; } + else { success = 0; } + if(!success) { trashCode += 'n'; } + } + + if(pDataArray->flip){ // should go last + currSeq.reverseComplement(); + if(pDataArray->qFileName != ""){ + currQual.flipQScores(); + } + } + + if(trashCode.length() == 0){ + currSeq.setAligned(currSeq.getUnaligned()); + currSeq.printSequence(trimFASTAFile); + + if(pDataArray->qFileName != ""){ + currQual.printQScores(trimQualFile); + } + + if(pDataArray->nameFile != ""){ + map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); + if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; } + else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + } + + if (pDataArray->createGroup) { + if(pDataArray->barcodes.size() != 0){ + string thisGroup = pDataArray->barcodeNameVector[barcodeIndex]; + if (pDataArray->primers.size() != 0) { + if (pDataArray->primerNameVector[primerIndex] != "") { + if(thisGroup != "") { + thisGroup += "." + pDataArray->primerNameVector[primerIndex]; + }else { + thisGroup = pDataArray->primerNameVector[primerIndex]; + } + } + } + + outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; + + if (pDataArray->nameFile != "") { + map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); + if (itName != pDataArray->nameMap.end()) { + vector thisSeqsNames; + pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ','); + for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self + outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl; + } + }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + } + + map::iterator it = pDataArray->groupCounts.find(thisGroup); + if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1; } + else { pDataArray->groupCounts[it->first]++; } + + } + } + + if(pDataArray->allFiles){ + ofstream output; + pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output); + currSeq.printSequence(output); + output.close(); + + if(pDataArray->qFileName != ""){ + pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output); + currQual.printQScores(output); + output.close(); + } + + if(pDataArray->nameFile != ""){ + map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); + if (itName != pDataArray->nameMap.end()) { + pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output); + output << itName->first << '\t' << itName->second << endl; + output.close(); + }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + } + } + } + else{ + if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed + map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); + if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; } + else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + } + currSeq.setName(currSeq.getName() + '|' + trashCode); + currSeq.setUnaligned(origSeq); + currSeq.setAligned(origSeq); + currSeq.printSequence(scrapFASTAFile); + if(pDataArray->qFileName != ""){ + currQual.printQScores(scrapQualFile); + } + } + + } + + //report progress + if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); } + + } + //report progress + if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); } + + + inFASTA.close(); + trimFASTAFile.close(); + scrapFASTAFile.close(); + if (pDataArray->createGroup) { outGroupsFile.close(); } + if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); } + if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); } + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction"); + exit(1); + } + } +#endif + + +/**************************************************************************************************/ + #endif