From 09207a049d2ec77f56ac98960c6525be084fa090 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Fri, 24 Feb 2012 08:38:04 -0500 Subject: [PATCH] paralellized trim.seqs for windows. fixed bug in trim.flows that stopped fasta data from being trimmed. --- flowdata.cpp | 3 +- trimflowscommand.cpp | 4 +- trimoligos.cpp | 24 +-- trimseqscommand.cpp | 200 ++++++++++++++++---- trimseqscommand.h | 431 +++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 601 insertions(+), 61 deletions(-) diff --git a/flowdata.cpp b/flowdata.cpp index 6d6c89b..1420f84 100644 --- a/flowdata.cpp +++ b/flowdata.cpp @@ -135,7 +135,8 @@ void FlowData::capFlows(int mF){ try{ maxFlows = mF; - if(endFlow > maxFlows){ endFlow = maxFlows; } + if(endFlow > maxFlows){ endFlow = maxFlows; } + translateFlow(); } catch(exception& e) { diff --git a/trimflowscommand.cpp b/trimflowscommand.cpp index f5ef635..d87a8df 100644 --- a/trimflowscommand.cpp +++ b/trimflowscommand.cpp @@ -381,9 +381,7 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer); while(moreSeqs) { - //cout << "driver " << count << endl; - - + if (m->control_pressed) { break; } int success = 1; diff --git a/trimoligos.cpp b/trimoligos.cpp index cf81870..8c523ce 100644 --- a/trimoligos.cpp +++ b/trimoligos.cpp @@ -576,13 +576,13 @@ bool TrimOligos::stripReverse(Sequence& seq){ bool TrimOligos::stripLinker(Sequence& seq, QualityScores& qual){ try { string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent + bool success = ldiffs + 1; //guilty until proven innocent for(int i=0;i fastaFilePos; - vector qFilePos; + //fills lines and qlines + setLines(fastaFile, qFileName); - setLines(fastaFile, qFileName, fastaFilePos, qFilePos); - - for (int i = 0; i < (fastaFilePos.size()-1); i++) { - lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)])); - if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); } - } - if(qFileName == "") { qLines = lines; } //files with duds - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); }else{ createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames); } - #else - driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); - #endif + //#else + // driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); + //#endif if (m->control_pressed) { return 0; } @@ -503,7 +495,7 @@ int TrimSeqsCommand::execute(){ /**************************************************************************************/ -int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames, linePair* line, linePair* qline) { +int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames, linePair line, linePair qline) { try { @@ -550,12 +542,12 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string ifstream inFASTA; m->openInputFile(filename, inFASTA); - inFASTA.seekg(line->start); + inFASTA.seekg(line.start); ifstream qFile; if(qFileName != "") { m->openInputFile(qFileName, qFile); - qFile.seekg(qline->start); + qFile.seekg(qline.start); } int count = 0; @@ -595,7 +587,9 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(numLinkers != 0){ success = trimOligos.stripLinker(currSeq, currQual); - if(!success) { trashCode += 'k'; } + if(success > ldiffs) { trashCode += 'k'; } + else{ currentSeqsDiffs += success; } + } if(barcodes.size() != 0){ @@ -606,7 +600,9 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(numSpacers != 0){ success = trimOligos.stripSpacer(currSeq, currQual); - if(!success) { trashCode += 's'; } + if(success > sdiffs) { trashCode += 's'; } + else{ currentSeqsDiffs += success; } + } if(numFPrimers != 0){ @@ -755,7 +751,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) unsigned long long pos = inFASTA.tellg(); - if ((pos == -1) || (pos >= line->end)) { break; } + if ((pos == -1) || (pos >= line.end)) { break; } #else if (inFASTA.eof()) { break; } @@ -788,12 +784,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames) { try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 1; + + int process = 1; int exitCommand = 1; processIDS.clear(); - //loop through and create all the processes you want +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + //loop through and create all the processes you want while (process != processors) { int pid = fork(); @@ -884,8 +881,105 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName int temp = processIDS[i]; wait(&temp); } - - //append files +#else + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the trimData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=0; i > tempFASTAFileNames = fastaFileNames; + vector > tempPrimerQualFileNames = qualFileNames; + vector > tempNameFileNames = nameFileNames; + + if(allFiles){ + ofstream temp; + + for(int i=0;iopenOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); + + if(qFileName != ""){ + tempPrimerQualFileNames[i][j] += extension; + m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); + } + if(nameFile != ""){ + tempNameFileNames[i][j] += extension; + m->openOutputFile(tempNameFileNames[i][j], temp); temp.close(); + } + } + } + } + } + + + trimData* tempTrim = new trimData(filename, + qFileName, nameFile, + (trimFASTAFileName+extension), + (scrapFASTAFileName+extension), + (trimQualFileName+extension), + (scrapQualFileName+extension), + (trimNameFileName+extension), + (scrapNameFileName+extension), + (groupFile+extension), + tempFASTAFileNames, + tempPrimerQualFileNames, + tempNameFileNames, + lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m, + pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer, + primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast, + qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage, + minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap); + pDataArray.push_back(tempTrim); + + hThreadArray[i] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); + } + + //parent do my part + ofstream temp; + m->openOutputFile(trimFASTAFileName, temp); temp.close(); + m->openOutputFile(scrapFASTAFileName, temp); temp.close(); + if(qFileName != ""){ + m->openOutputFile(trimQualFileName, temp); temp.close(); + m->openOutputFile(scrapQualFileName, temp); temp.close(); + } + if (nameFile != "") { + m->openOutputFile(trimNameFileName, temp); temp.close(); + m->openOutputFile(scrapNameFileName, temp); temp.close(); + } + + driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]); + processIDS.push_back(processors-1); + + + //Wait until all threads have terminated. + WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + for (map::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) { + map::iterator it2 = groupCounts.find(it->first); + if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; } + else { groupCounts[it->first] += it->second; } + } + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + +#endif + + + //append files for(int i=0;imothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine(); @@ -936,6 +1030,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName } } + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(createGroup){ ifstream in; string tempFile = filename + toString(processIDS[i]) + ".num.temp"; @@ -948,7 +1043,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName if (tempNum != 0) { while (!in.eof()) { in >> group >> tempNum; m->gobble(in); - + map::iterator it = groupCounts.find(group); if (it == groupCounts.end()) { groupCounts[group] = tempNum; } else { groupCounts[it->first] += tempNum; } @@ -956,11 +1051,10 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName } in.close(); m->mothurRemove(tempFile); } - + #endif } - - return exitCommand; -#endif + + return exitCommand; } catch(exception& e) { m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim"); @@ -970,8 +1064,12 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName /**************************************************************************************************/ -int TrimSeqsCommand::setLines(string filename, string qfilename, vector& fastaFilePos, vector& qfileFilePos) { +int TrimSeqsCommand::setLines(string filename, string qfilename) { try { + + vector fastaFilePos; + vector qfileFilePos; + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) //set file positions for fasta file fastaFilePos = m->divideFile(filename, processors); @@ -1043,13 +1141,47 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vectorsetFilePosFasta(filename, numFastaSeqs); + + if (qfilename != "") { + int numQualSeqs = 0; + qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs); + + if (numFastaSeqs != numQualSeqs) { + m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true; + } + } + + //figure out how many sequences you have to process + int numSeqsPerProcessor = numFastaSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } + lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor)); + cout << fastaFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl; + if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); } + } + + if(qfilename == "") { qLines = lines; } //files with duds + } return 1; #endif diff --git a/trimseqscommand.h b/trimseqscommand.h index b6050f2..5006ce9 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -15,6 +15,8 @@ #include "sequence.hpp" #include "qualityscores.h" #include "groupmap.h" +#include "trimoligos.h" + class TrimSeqsCommand : public Command { public: @@ -35,13 +37,14 @@ public: private: GroupMap* groupMap; - - struct linePair { - unsigned long long start; - unsigned long long end; - linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} - }; - + + struct linePair { + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} + linePair() {} + }; + bool getOligos(vector >&, vector >&, vector >&); bool keepFirstTrim(Sequence&, QualityScores&); bool removeLastTrim(Sequence&, QualityScores&); @@ -71,12 +74,418 @@ private: map nameMap; vector processIDS; //processid - vector lines; - vector qLines; + vector lines; + vector qLines; - int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector >, vector >, vector >, linePair*, linePair*); + int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector >, vector >, vector >, linePair, linePair); int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector >, vector >, vector >); - int setLines(string, string, vector&, vector&); + int setLines(string, string); +}; + +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct trimData { + unsigned long long start, end; + MothurOut* m; + string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile; + vector > fastaFileNames; + vector > qualFileNames; + vector > nameFileNames; + unsigned long long lineStart, lineEnd, qlineStart, qlineEnd; + bool flip, allFiles, qtrim, keepforward, createGroup; + int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs; + int qWindowSize, qWindowStep, keepFirst, removeLast, count; + double qRollAverage, qThreshold, qWindowAverage, qAverage; + vector revPrimer; + map barcodes; + map primers; + vector linker; + vector spacer; + map combos; + vector primerNameVector; + vector barcodeNameVector; + map groupCounts; + map nameMap; + + trimData(){} + trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector > ffn, vector > qfn, vector > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout, + int pd, int bd, int ld, int sd, int td, map pri, map bar, vector revP, vector li, vector spa, + vector priNameVector, vector barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL, + int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage, + int minL, int maxA, int maxH, int maxL, bool fli, map nm) { + filename = fn; + qFileName = qn; + nameFile = nf; + trimFileName = tn; + scrapFileName = sn; + trimQFileName = tqn; + scrapQFileName = sqn; + trimNFileName = tnn; + scrapNFileName = snn; + groupFileName = gn; + fastaFileNames = ffn; + qualFileNames = qfn; + nameFileNames = nfn; + lineStart = lstart; + lineEnd = lend; + qlineStart = qstart; + qlineEnd = qend; + m = mout; + + pdiffs = pd; + bdiffs = bd; + ldiffs = ld; + sdiffs = sd; + tdiffs = td; + barcodes = bar; + primers = pri; numFPrimers = primers.size(); + revPrimer = revP; numRPrimers = revPrimer.size(); + linker = li; numLinkers = linker.size(); + spacer = spa; numSpacers = spacer.size(); + primerNameVector = priNameVector; + barcodeNameVector = barNameVector; + + createGroup = cGroup; + allFiles = aFiles; + keepforward = keepF; + keepFirst = keepfi; + removeLast = removeL; + qWindowStep = WindowStep; + qWindowSize = WindowSize; + qWindowAverage = WindowAverage; + qtrim = trim; + qThreshold = Threshold; + qAverage = Average; + qRollAverage = RollAverage; + minLength = minL; + maxAmbig = maxA; + maxHomoP = maxH; + maxLength = maxL; + flip = fli; + nameMap = nm; + count = 0; + } }; +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#else +static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ + trimData* pDataArray; + pDataArray = (trimData*)lpParam; + + try { + ofstream trimFASTAFile; + pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile); + + ofstream scrapFASTAFile; + pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile); + + ofstream trimQualFile; + ofstream scrapQualFile; + if(pDataArray->qFileName != ""){ + pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile); + pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile); + } + + ofstream trimNameFile; + ofstream scrapNameFile; + if(pDataArray->nameFile != ""){ + pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile); + pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile); + } + + + ofstream outGroupsFile; + if (pDataArray->createGroup){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); } + if(pDataArray->allFiles){ + for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file + for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file + if (pDataArray->fastaFileNames[i][j] != "") { + ofstream temp; + pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close(); + if(pDataArray->qFileName != ""){ + pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close(); + } + + if(pDataArray->nameFile != ""){ + pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close(); + } + } + } + } + } + + ifstream inFASTA; + pDataArray->m->openInputFile(pDataArray->filename, inFASTA); + if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { + inFASTA.seekg(0); + }else { //this accounts for the difference in line endings. + inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA); + } + + ifstream qFile; + if(pDataArray->qFileName != "") { + pDataArray->m->openInputFile(pDataArray->qFileName, qFile); + if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) { + qFile.seekg(0); + }else { //this accounts for the difference in line endings. + qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile); + } + } + + + TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer); + + pDataArray->count = pDataArray->lineEnd; + for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process + + if (pDataArray->m->control_pressed) { + inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close(); + if (pDataArray->createGroup) { outGroupsFile.close(); } + if(pDataArray->qFileName != ""){ qFile.close(); } + return 0; + } + + int success = 1; + string trashCode = ""; + int currentSeqsDiffs = 0; + + Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA); + + QualityScores currQual; + if(pDataArray->qFileName != ""){ + currQual = QualityScores(qFile); pDataArray->m->gobble(qFile); + } + + string origSeq = currSeq.getUnaligned(); + if (origSeq != "") { + + int barcodeIndex = 0; + int primerIndex = 0; + + if(pDataArray->numLinkers != 0){ + success = trimOligos.stripLinker(currSeq, currQual); + if(success > pDataArray->ldiffs) { trashCode += 'k'; } + else{ currentSeqsDiffs += success; } + } + + if(pDataArray->barcodes.size() != 0){ + success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex); + if(success > pDataArray->bdiffs) { trashCode += 'b'; } + else{ currentSeqsDiffs += success; } + } + + if(pDataArray->numSpacers != 0){ + success = trimOligos.stripSpacer(currSeq, currQual); + if(success > pDataArray->sdiffs) { trashCode += 's'; } + else{ currentSeqsDiffs += success; } + + } + + if(pDataArray->numFPrimers != 0){ + success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward); + if(success > pDataArray->pdiffs) { trashCode += 'f'; } + else{ currentSeqsDiffs += success; } + } + + if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; } + + if(pDataArray->numRPrimers != 0){ + success = trimOligos.stripReverse(currSeq, currQual); + if(!success) { trashCode += 'r'; } + } + + if(pDataArray->keepFirst != 0){ + //success = keepFirstTrim(currSeq, currQual); + success = 1; + if(currQual.getName() != ""){ + currQual.trimQScores(-1, pDataArray->keepFirst); + } + currSeq.trim(pDataArray->keepFirst); + } + + if(pDataArray->removeLast != 0){ + //success = removeLastTrim(currSeq, currQual); + success = 0; + int length = currSeq.getNumBases() - pDataArray->removeLast; + + if(length > 0){ + if(currQual.getName() != ""){ + currQual.trimQScores(-1, length); + } + currSeq.trim(length); + success = 1; + } + else{ success = 0; } + + if(!success) { trashCode += 'l'; } + } + + + if(pDataArray->qFileName != ""){ + int origLength = currSeq.getNumBases(); + + if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); } + else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); } + else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); } + else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); } + else { success = 1; } + + //you don't want to trim, if it fails above then scrap it + if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; } + + if(!success) { trashCode += 'q'; } + } + + if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){ + //success = cullLength(currSeq); + int length = currSeq.getNumBases(); + success = 0; //guilty until proven innocent + if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; } + else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; } + else { success = 0; } + + if(!success) { trashCode += 'l'; } + } + if(pDataArray->maxHomoP > 0){ + //success = cullHomoP(currSeq); + int longHomoP = currSeq.getLongHomoPolymer(); + success = 0; //guilty until proven innocent + if(longHomoP <= pDataArray->maxHomoP){ success = 1; } + else { success = 0; } + + if(!success) { trashCode += 'h'; } + } + if(pDataArray->maxAmbig != -1){ + //success = cullAmbigs(currSeq); + int numNs = currSeq.getAmbigBases(); + success = 0; //guilty until proven innocent + if(numNs <= pDataArray->maxAmbig) { success = 1; } + else { success = 0; } + if(!success) { trashCode += 'n'; } + } + + if(pDataArray->flip){ // should go last + currSeq.reverseComplement(); + if(pDataArray->qFileName != ""){ + currQual.flipQScores(); + } + } + + if(trashCode.length() == 0){ + currSeq.setAligned(currSeq.getUnaligned()); + currSeq.printSequence(trimFASTAFile); + + if(pDataArray->qFileName != ""){ + currQual.printQScores(trimQualFile); + } + + if(pDataArray->nameFile != ""){ + map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); + if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; } + else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + } + + if (pDataArray->createGroup) { + if(pDataArray->barcodes.size() != 0){ + string thisGroup = pDataArray->barcodeNameVector[barcodeIndex]; + if (pDataArray->primers.size() != 0) { + if (pDataArray->primerNameVector[primerIndex] != "") { + if(thisGroup != "") { + thisGroup += "." + pDataArray->primerNameVector[primerIndex]; + }else { + thisGroup = pDataArray->primerNameVector[primerIndex]; + } + } + } + + outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; + + if (pDataArray->nameFile != "") { + map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); + if (itName != pDataArray->nameMap.end()) { + vector thisSeqsNames; + pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ','); + for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self + outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl; + } + }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + } + + map::iterator it = pDataArray->groupCounts.find(thisGroup); + if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1; } + else { pDataArray->groupCounts[it->first]++; } + + } + } + + if(pDataArray->allFiles){ + ofstream output; + pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output); + currSeq.printSequence(output); + output.close(); + + if(pDataArray->qFileName != ""){ + pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output); + currQual.printQScores(output); + output.close(); + } + + if(pDataArray->nameFile != ""){ + map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); + if (itName != pDataArray->nameMap.end()) { + pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output); + output << itName->first << '\t' << itName->second << endl; + output.close(); + }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + } + } + } + else{ + if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed + map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); + if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; } + else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + } + currSeq.setName(currSeq.getName() + '|' + trashCode); + currSeq.setUnaligned(origSeq); + currSeq.setAligned(origSeq); + currSeq.printSequence(scrapFASTAFile); + if(pDataArray->qFileName != ""){ + currQual.printQScores(scrapQualFile); + } + } + + } + + //report progress + if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); } + + } + //report progress + if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); } + + + inFASTA.close(); + trimFASTAFile.close(); + scrapFASTAFile.close(); + if (pDataArray->createGroup) { outGroupsFile.close(); } + if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); } + if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); } + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction"); + exit(1); + } + } +#endif + + +/**************************************************************************************************/ #endif -- 2.39.2