X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=trimseqscommand.h;h=53f04d37965d55c189447c7cbd1c702ac93dc5ac;hp=9ba64c4b62b7c7e87f213472b730a91bd0b37305;hb=615301e57c25e241356a9c2380648d117709458d;hpb=91a27e0483827c06c21c4fe89558923bbfe86573 diff --git a/trimseqscommand.h b/trimseqscommand.h index 9ba64c4..53f04d3 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -14,8 +14,8 @@ #include "command.hpp" #include "sequence.hpp" #include "qualityscores.h" -#include "groupmap.h" #include "trimoligos.h" +#include "counttable.h" class TrimSeqsCommand : public Command { @@ -27,7 +27,9 @@ public: vector setParameters(); string getCommandName() { return "trim.seqs"; } string getCommandCategory() { return "Sequence Processing"; } + string getHelpString(); + string getOutputPattern(string); string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; } string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; } @@ -35,16 +37,13 @@ public: void help() { m->mothurOut(getHelpString()); } private: - - GroupMap* groupMap; - struct linePair { unsigned long long start; unsigned long long end; linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} linePair() {} }; - + bool getOligos(vector >&, vector >&, vector >&); bool keepFirstTrim(Sequence&, QualityScores&); bool removeLastTrim(Sequence&, QualityScores&); @@ -54,14 +53,16 @@ private: string reverseOligo(string); bool abort, createGroup; - string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir; + string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir; - bool flip, allFiles, qtrim, keepforward; + bool flip, allFiles, qtrim, keepforward, pairedOligos, reorient; int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts; int qWindowSize, qWindowStep, keepFirst, removeLast; double qRollAverage, qThreshold, qWindowAverage, qAverage; vector revPrimer, outputNames; set filesToRemove; + map pairedBarcodes; + map pairedPrimers; map barcodes; vector groupVector; map primers; @@ -73,13 +74,15 @@ private: vector barcodeNameVector; //needed here? map groupCounts; map nameMap; + map nameCount; //for countfile name -> repCount + map groupMap; //for countfile name -> group vector processIDS; //processid vector lines; vector qLines; - int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector >, vector >, vector >, linePair, linePair); - int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector >, vector >, vector >); + int driverCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector >, vector >, vector >, linePair, linePair); + int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector >, vector >, vector >); int setLines(string, string); }; @@ -90,18 +93,19 @@ private: struct trimData { unsigned long long start, end; MothurOut* m; - string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile; + string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, trimCFileName, scrapCFileName, groupFileName, nameFile, countfile; vector > fastaFileNames; vector > qualFileNames; vector > nameFileNames; unsigned long long lineStart, lineEnd, qlineStart, qlineEnd; - bool flip, allFiles, qtrim, keepforward, createGroup; + bool flip, allFiles, qtrim, keepforward, createGroup, pairedOligos, reorient; int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs; int qWindowSize, qWindowStep, keepFirst, removeLast, count; double qRollAverage, qThreshold, qWindowAverage, qAverage; vector revPrimer; map barcodes; map primers; + map nameCount; vector linker; vector spacer; map combos; @@ -109,22 +113,28 @@ struct trimData { vector barcodeNameVector; map groupCounts; map nameMap; + map groupMap; + map pairedBarcodes; + map pairedPrimers; trimData(){} - trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector > ffn, vector > qfn, vector > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout, - int pd, int bd, int ld, int sd, int td, map pri, map bar, vector revP, vector li, vector spa, + trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector > ffn, vector > qfn, vector > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout, + int pd, int bd, int ld, int sd, int td, map pri, map bar, vector revP, vector li, vector spa, map pbr, map ppr, bool po, vector priNameVector, vector barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL, int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage, - int minL, int maxA, int maxH, int maxL, bool fli, map nm) { + int minL, int maxA, int maxH, int maxL, bool fli, bool reo, map nm, map ncount) { filename = fn; qFileName = qn; nameFile = nf; + countfile = cf; trimFileName = tn; scrapFileName = sn; trimQFileName = tqn; scrapQFileName = sqn; trimNFileName = tnn; scrapNFileName = snn; + trimCFileName = tcn; + scrapCFileName = scn; groupFileName = gn; fastaFileNames = ffn; qualFileNames = qfn; @@ -134,6 +144,7 @@ struct trimData { qlineStart = qstart; qlineEnd = qend; m = mout; + nameCount = ncount; pdiffs = pd; bdiffs = bd; @@ -141,6 +152,9 @@ struct trimData { sdiffs = sd; tdiffs = td; barcodes = bar; + pairedPrimers = ppr; + pairedBarcodes = pbr; + pairedOligos = po; primers = pri; numFPrimers = primers.size(); revPrimer = revP; numRPrimers = revPrimer.size(); linker = li; numLinkers = linker.size(); @@ -165,6 +179,7 @@ struct trimData { maxHomoP = maxH; maxLength = maxL; flip = fli; + reorient = reo; nameMap = nm; count = 0; } @@ -199,7 +214,7 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ ofstream outGroupsFile; - if (pDataArray->createGroup){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); } + if ((pDataArray->createGroup) && (pDataArray->countfile == "")){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); } if(pDataArray->allFiles){ for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file @@ -218,6 +233,14 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ } } + ofstream trimCountFile; + ofstream scrapCountFile; + if(pDataArray->countfile != ""){ + pDataArray->m->openOutputFile(pDataArray->trimCFileName, trimCountFile); + pDataArray->m->openOutputFile(pDataArray->scrapCFileName, scrapCountFile); + if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; } + } + ifstream inFASTA; pDataArray->m->openInputFile(pDataArray->filename, inFASTA); if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { @@ -236,15 +259,37 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ } } - - TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer); + TrimOligos* trimOligos = NULL; + int numBarcodes = pDataArray->barcodes.size(); + if (pDataArray->pairedOligos) { trimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->pairedPrimers, pDataArray->pairedBarcodes); numBarcodes = pDataArray->pairedBarcodes.size(); pDataArray->numFPrimers = pDataArray->pairedPrimers.size(); } + else { trimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer); } + + TrimOligos* rtrimOligos = NULL; + if (pDataArray->reorient) { + //create reoriented primer and barcode pairs + map rpairedPrimers, rpairedBarcodes; + for (map::iterator it = pDataArray->pairedPrimers.begin(); it != pDataArray->pairedPrimers.end(); it++) { + oligosPair tempPair(trimOligos->reverseOligo((it->second).reverse), (trimOligos->reverseOligo((it->second).forward))); //reversePrimer, rc ForwardPrimer + rpairedPrimers[it->first] = tempPair; + } + for (map::iterator it = pDataArray->pairedBarcodes.begin(); it != pDataArray->pairedBarcodes.end(); it++) { + oligosPair tempPair(trimOligos->reverseOligo((it->second).reverse), (trimOligos->reverseOligo((it->second).forward))); //reverseBarcode, rc ForwardBarcode + rpairedBarcodes[it->first] = tempPair; + } + rtrimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, rpairedPrimers, rpairedBarcodes); numBarcodes = rpairedBarcodes.size(); + } - pDataArray->count = pDataArray->lineEnd; + pDataArray->count = 0; for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process - if (pDataArray->m->control_pressed) { + if (pDataArray->m->control_pressed) { + delete trimOligos; if (pDataArray->reorient) { delete rtrimOligos; } inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close(); - if (pDataArray->createGroup) { outGroupsFile.close(); } + if ((pDataArray->createGroup) && (pDataArray->countfile == "")) { outGroupsFile.close(); } + if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); } + if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); } + if(pDataArray->countfile != "") { scrapCountFile.close(); trimCountFile.close(); } + if(pDataArray->qFileName != ""){ qFile.close(); } return 0; } @@ -254,39 +299,43 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ int currentSeqsDiffs = 0; Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA); + Sequence savedSeq(currSeq.getName(), currSeq.getAligned()); - QualityScores currQual; + QualityScores currQual; QualityScores savedQual; if(pDataArray->qFileName != ""){ currQual = QualityScores(qFile); pDataArray->m->gobble(qFile); + savedQual.setName(currQual.getName()); savedQual.setScores(currQual.getScores()); } + string origSeq = currSeq.getUnaligned(); if (origSeq != "") { + pDataArray->count++; int barcodeIndex = 0; int primerIndex = 0; if(pDataArray->numLinkers != 0){ - success = trimOligos.stripLinker(currSeq, currQual); + success = trimOligos->stripLinker(currSeq, currQual); if(success > pDataArray->ldiffs) { trashCode += 'k'; } else{ currentSeqsDiffs += success; } } - if(pDataArray->barcodes.size() != 0){ - success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex); + if(numBarcodes != 0){ + success = trimOligos->stripBarcode(currSeq, currQual, barcodeIndex); if(success > pDataArray->bdiffs) { trashCode += 'b'; } else{ currentSeqsDiffs += success; } } - + if(pDataArray->numSpacers != 0){ - success = trimOligos.stripSpacer(currSeq, currQual); + success = trimOligos->stripSpacer(currSeq, currQual); if(success > pDataArray->sdiffs) { trashCode += 's'; } else{ currentSeqsDiffs += success; } } if(pDataArray->numFPrimers != 0){ - success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward); + success = trimOligos->stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward); if(success > pDataArray->pdiffs) { trashCode += 'f'; } else{ currentSeqsDiffs += success; } } @@ -294,10 +343,48 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; } if(pDataArray->numRPrimers != 0){ - success = trimOligos.stripReverse(currSeq, currQual); + success = trimOligos->stripReverse(currSeq, currQual); if(!success) { trashCode += 'r'; } } + if (pDataArray->reorient && (trashCode != "")) { //if you failed and want to check the reverse + int thisSuccess = 0; + string thisTrashCode = ""; + int thisCurrentSeqsDiffs = 0; + + int thisBarcodeIndex = 0; + int thisPrimerIndex = 0; + + if(numBarcodes != 0){ + thisSuccess = rtrimOligos->stripBarcode(savedSeq, savedQual, thisBarcodeIndex); + if(thisSuccess > pDataArray->bdiffs) { thisTrashCode += 'b'; } + else{ thisCurrentSeqsDiffs += thisSuccess; } + } + + if(pDataArray->numFPrimers != 0){ + thisSuccess = rtrimOligos->stripForward(savedSeq, savedQual, thisPrimerIndex, pDataArray->keepforward); + if(thisSuccess > pDataArray->pdiffs) { thisTrashCode += 'f'; } + else{ thisCurrentSeqsDiffs += thisSuccess; } + } + + if (thisCurrentSeqsDiffs > pDataArray->tdiffs) { thisTrashCode += 't'; } + + if (thisTrashCode == "") { + trashCode = thisTrashCode; + success = thisSuccess; + currentSeqsDiffs = thisCurrentSeqsDiffs; + barcodeIndex = thisBarcodeIndex; + primerIndex = thisPrimerIndex; + savedSeq.reverseComplement(); + currSeq.setAligned(savedSeq.getAligned()); + if(pDataArray->qFileName != ""){ + savedQual.flipQScores(); + currQual.setScores(savedQual.getScores()); + } + }else { trashCode += "(" + thisTrashCode + ")"; } + } + + if(pDataArray->keepFirst != 0){ //success = keepFirstTrim(currSeq, currQual); success = 1; @@ -376,23 +463,11 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ } if(trashCode.length() == 0){ - currSeq.setAligned(currSeq.getUnaligned()); - currSeq.printSequence(trimFASTAFile); - - if(pDataArray->qFileName != ""){ - currQual.printQScores(trimQualFile); - } - - if(pDataArray->nameFile != ""){ - map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); - if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; } - else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } - } - - if (pDataArray->createGroup) { - if(pDataArray->barcodes.size() != 0){ - string thisGroup = pDataArray->barcodeNameVector[barcodeIndex]; - if (pDataArray->primers.size() != 0) { + string thisGroup = ""; + if (pDataArray->createGroup) { + if(numBarcodes != 0){ + thisGroup = pDataArray->barcodeNameVector[barcodeIndex]; + if (pDataArray->numFPrimers != 0) { if (pDataArray->primerNameVector[primerIndex] != "") { if(thisGroup != "") { thisGroup += "." + pDataArray->primerNameVector[primerIndex]; @@ -401,48 +476,81 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ } } } - - outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; - - if (pDataArray->nameFile != "") { - map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); - if (itName != pDataArray->nameMap.end()) { - vector thisSeqsNames; - pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ','); - for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self - outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl; - } - }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } - } - - map::iterator it = pDataArray->groupCounts.find(thisGroup); - if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1; } - else { pDataArray->groupCounts[it->first]++; } + } + } + + int pos = thisGroup.find("ignore"); + if (pos == string::npos) { + + currSeq.setAligned(currSeq.getUnaligned()); + currSeq.printSequence(trimFASTAFile); + + if(pDataArray->qFileName != ""){ + currQual.printQScores(trimQualFile); + } + + if(pDataArray->nameFile != ""){ + map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); + if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; } + else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + } + + int numRedundants = 0; + if (pDataArray->countfile != "") { + map::iterator itCount = pDataArray->nameCount.find(currSeq.getName()); + if (itCount != pDataArray->nameCount.end()) { + trimCountFile << itCount->first << '\t' << itCount->second << endl; + numRedundants = itCount->second-1; + }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); } + } + + if (pDataArray->createGroup) { + if(numBarcodes != 0){ + + if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; } + else { pDataArray->groupMap[currSeq.getName()] = thisGroup; } + + if (pDataArray->nameFile != "") { + map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); + if (itName != pDataArray->nameMap.end()) { + vector thisSeqsNames; + pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ','); + numRedundants = thisSeqsNames.size()-1; //we already include ourselves below + for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self + outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl; + } + }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + } + + map::iterator it = pDataArray->groupCounts.find(thisGroup); + if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1 + numRedundants; } + else { pDataArray->groupCounts[it->first] += (1 + numRedundants); } + + } + } + + if(pDataArray->allFiles){ + ofstream output; + pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output); + currSeq.printSequence(output); + output.close(); - } - } - - if(pDataArray->allFiles){ - ofstream output; - pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output); - currSeq.printSequence(output); - output.close(); - - if(pDataArray->qFileName != ""){ - pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output); - currQual.printQScores(output); - output.close(); - } - - if(pDataArray->nameFile != ""){ - map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); - if (itName != pDataArray->nameMap.end()) { - pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output); - output << itName->first << '\t' << itName->second << endl; - output.close(); - }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } - } - } + if(pDataArray->qFileName != ""){ + pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output); + currQual.printQScores(output); + output.close(); + } + + if(pDataArray->nameFile != ""){ + map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); + if (itName != pDataArray->nameMap.end()) { + pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output); + output << itName->first << '\t' << itName->second << endl; + output.close(); + }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + } + } + } } else{ if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed @@ -450,6 +558,12 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; } else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } } + if (pDataArray->countfile != "") { + map::iterator itCount = pDataArray->nameCount.find(currSeq.getName()); + if (itCount != pDataArray->nameCount.end()) { + trimCountFile << itCount->first << '\t' << itCount->second << endl; + }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); } + } currSeq.setName(currSeq.getName() + '|' + trashCode); currSeq.setUnaligned(origSeq); currSeq.setAligned(origSeq); @@ -462,13 +576,14 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ } //report progress - if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); } + if((pDataArray->count) % 1000 == 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); } } //report progress if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); } - + if (pDataArray->reorient) { delete rtrimOligos; } + delete trimOligos; inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();