X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimseqscommand.cpp;h=6a8afd017c1e88ff1bdbde883608b5eff00b235f;hb=d71a31a60542608595ce1278cc4a3398479cec7f;hp=1cbb91a2da487a7f0aa00a0689d393297ba592c0;hpb=ae57e166b2ed7b475ec3f466106bd76fabadd063;p=mothur.git diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 1cbb91a..6a8afd0 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -25,9 +25,12 @@ vector TrimSeqsCommand::setParameters(){ CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength); CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs); CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs); - CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs); + CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs); + CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs); + CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles); + CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward); CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim); CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold); CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage); @@ -64,9 +67,11 @@ string TrimSeqsCommand::getHelpString(){ helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n"; helpString += "The minlength parameter allows you to set and minimum sequence length. \n"; helpString += "The maxlength parameter allows you to set and maximum sequence length. \n"; - helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n"; + helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; + helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n"; + helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; helpString += "The qfile parameter allows you to provide a quality file.\n"; helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n"; helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n"; @@ -75,6 +80,7 @@ string TrimSeqsCommand::getHelpString(){ helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n"; helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n"; helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n"; + helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n"; helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n"; helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n"; helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n"; @@ -91,6 +97,29 @@ string TrimSeqsCommand::getHelpString(){ exit(1); } } +//********************************************************************************************************************** +string TrimSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ + try { + string outputFileName = ""; + map >::iterator it; + + //is this a type this command creates + it = outputTypes.find(type); + if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } + else { + if (type == "qfile") { outputFileName = "qual"; } + else if (type == "fasta") { outputFileName = "fasta"; } + else if (type == "group") { outputFileName = "groups"; } + else if (type == "name") { outputFileName = "names"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } + } + return outputFileName; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "getOutputFileNameTag"); + exit(1); + } +} //********************************************************************************************************************** @@ -203,8 +232,8 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { // ...at some point should added some additional type checking... string temp; temp = validParameter.validFile(parameters, "flip", false); - if (temp == "not found"){ flip = 0; } - else if(m->isTrue(temp)) { flip = 1; } + if (temp == "not found") { flip = 0; } + else { flip = m->isTrue(temp); } temp = validParameter.validFile(parameters, "oligos", true); if (temp == "not found"){ oligoFile = ""; } @@ -213,27 +242,33 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; } - convert(temp, maxAmbig); + m->mothurConvert(temp, maxAmbig); temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; } - convert(temp, maxHomoP); + m->mothurConvert(temp, maxHomoP); temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; } - convert(temp, minLength); + m->mothurConvert(temp, minLength); temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; } - convert(temp, maxLength); + m->mothurConvert(temp, maxLength); temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; } - convert(temp, bdiffs); + m->mothurConvert(temp, bdiffs); temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } - convert(temp, pdiffs); + m->mothurConvert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, ldiffs); + + temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, sdiffs); - temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); } - convert(temp, tdiffs); + temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); } + m->mothurConvert(temp, tdiffs); - if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; } + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; } temp = validParameter.validFile(parameters, "qfile", true); if (temp == "not found") { qFileName = ""; } @@ -246,7 +281,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { else { nameFile = temp; m->setNameFile(nameFile); } temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; } - convert(temp, qThreshold); + m->mothurConvert(temp, qThreshold); temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; } qtrim = m->isTrue(temp); @@ -274,10 +309,13 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; } allFiles = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; } + keepforward = m->isTrue(temp); temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); - convert(temp, processors); + m->mothurConvert(temp, processors); if(allFiles && (oligoFile == "")){ @@ -292,6 +330,11 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine(); abort = true; } + + if (nameFile == "") { + vector files; files.push_back(fastaFile); + parser.getNameFile(files); + } } } @@ -309,19 +352,21 @@ int TrimSeqsCommand::execute(){ numFPrimers = 0; //this needs to be initialized numRPrimers = 0; + numSpacers = 0; + numLinkers = 0; createGroup = false; vector > fastaFileNames; vector > qualFileNames; vector > nameFileNames; - string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta"; + string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("fasta"); outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile); - string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta"; + string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("fasta"); outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile); - string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual"; - string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual"; + string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("qfile"); + string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("qfile"); if (qFileName != "") { outputNames.push_back(trimQualFile); @@ -330,8 +375,8 @@ int TrimSeqsCommand::execute(){ outputTypes["qfile"].push_back(scrapQualFile); } - string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names"; - string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names"; + string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim." + getOutputFileNameTag("name"); + string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap." + getOutputFileNameTag("name"); if (nameFile != "") { m->readNames(nameFile, nameMap); @@ -347,31 +392,20 @@ int TrimSeqsCommand::execute(){ if(oligoFile != ""){ createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames); if (createGroup) { - outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups"; + outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + getOutputFileNameTag("group"); outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); } } + + //fills lines and qlines + setLines(fastaFile, qFileName); - vector fastaFilePos; - vector qFilePos; + if(processors == 1){ + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); + }else{ + createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames); + } - setLines(fastaFile, qFileName, fastaFilePos, qFilePos); - - for (int i = 0; i < (fastaFilePos.size()-1); i++) { - lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)])); - if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); } - } - if(qFileName == "") { qLines = lines; } //files with duds - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - if(processors == 1){ - driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); - }else{ - createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames); - } - #else - driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); - #endif if (m->control_pressed) { return 0; } @@ -417,7 +451,7 @@ int TrimSeqsCommand::execute(){ m->openInputFile(it->first, in); ofstream out; - string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups"; + string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + getOutputFileNameTag("group"); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); m->openOutputFile(thisGroupName, out); @@ -426,6 +460,17 @@ int TrimSeqsCommand::execute(){ Sequence currSeq(in); m->gobble(in); out << currSeq.getName() << '\t' << it->second << endl; + + if (nameFile != "") { + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { + vector thisSeqsNames; + m->splitAtChar(itName->second, thisSeqsNames, ','); + for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self + out << thisSeqsNames[k] << '\t' << it->second << endl; + } + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + } } in.close(); out.close(); @@ -483,7 +528,7 @@ int TrimSeqsCommand::execute(){ /**************************************************************************************/ -int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames, linePair* line, linePair* qline) { +int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames, linePair line, linePair qline) { try { @@ -530,17 +575,17 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string ifstream inFASTA; m->openInputFile(filename, inFASTA); - inFASTA.seekg(line->start); + inFASTA.seekg(line.start); ifstream qFile; if(qFileName != "") { m->openInputFile(qFileName, qFile); - qFile.seekg(qline->start); + qFile.seekg(qline.start); } int count = 0; bool moreSeqs = 1; - TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer); + TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer); while (moreSeqs) { @@ -573,14 +618,34 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int barcodeIndex = 0; int primerIndex = 0; + if(numLinkers != 0){ + success = trimOligos.stripLinker(currSeq, currQual); + if(success > ldiffs) { trashCode += 'k'; } + else{ currentSeqsDiffs += success; } + + } + if(barcodes.size() != 0){ success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex); if(success > bdiffs) { trashCode += 'b'; } else{ currentSeqsDiffs += success; } } + + if(rbarcodes.size() != 0){ + success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex); + if(success > bdiffs) { trashCode += 'b'; } + else{ currentSeqsDiffs += success; } + } + if(numSpacers != 0){ + success = trimOligos.stripSpacer(currSeq, currQual); + if(success > sdiffs) { trashCode += 's'; } + else{ currentSeqsDiffs += success; } + + } + if(numFPrimers != 0){ - success = trimOligos.stripForward(currSeq, currQual, primerIndex); + success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward); if(success > pdiffs) { trashCode += 'f'; } else{ currentSeqsDiffs += success; } } @@ -645,6 +710,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string currQual.printQScores(trimQualFile); } + if(nameFile != ""){ map::iterator itName = nameMap.find(currSeq.getName()); if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; } @@ -666,11 +732,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; + int numRedundants = 0; if (nameFile != "") { map::iterator itName = nameMap.find(currSeq.getName()); if (itName != nameMap.end()) { vector thisSeqsNames; m->splitAtChar(itName->second, thisSeqsNames, ','); + numRedundants = thisSeqsNames.size()-1; //we already include ourselves below for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl; } @@ -678,8 +746,8 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } map::iterator it = groupCounts.find(thisGroup); - if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; } - else { groupCounts[it->first]++; } + if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; } + else { groupCounts[it->first] += (1 + numRedundants); } } } @@ -723,9 +791,9 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string count++; } - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) unsigned long long pos = inFASTA.tellg(); - if ((pos == -1) || (pos >= line->end)) { break; } + if ((pos == -1) || (pos >= line.end)) { break; } #else if (inFASTA.eof()) { break; } @@ -758,12 +826,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames) { try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 1; + + int process = 1; int exitCommand = 1; processIDS.clear(); - //loop through and create all the processes you want +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + //loop through and create all the processes you want while (process != processors) { int pid = fork(); @@ -854,8 +923,105 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName int temp = processIDS[i]; wait(&temp); } - - //append files +#else + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the trimData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=0; i > tempFASTAFileNames = fastaFileNames; + vector > tempPrimerQualFileNames = qualFileNames; + vector > tempNameFileNames = nameFileNames; + + if(allFiles){ + ofstream temp; + + for(int i=0;iopenOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); + + if(qFileName != ""){ + tempPrimerQualFileNames[i][j] += extension; + m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); + } + if(nameFile != ""){ + tempNameFileNames[i][j] += extension; + m->openOutputFile(tempNameFileNames[i][j], temp); temp.close(); + } + } + } + } + } + + + trimData* tempTrim = new trimData(filename, + qFileName, nameFile, + (trimFASTAFileName+extension), + (scrapFASTAFileName+extension), + (trimQualFileName+extension), + (scrapQualFileName+extension), + (trimNameFileName+extension), + (scrapNameFileName+extension), + (groupFile+extension), + tempFASTAFileNames, + tempPrimerQualFileNames, + tempNameFileNames, + lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m, + pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer, + primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast, + qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage, + minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap); + pDataArray.push_back(tempTrim); + + hThreadArray[i] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); + } + + //parent do my part + ofstream temp; + m->openOutputFile(trimFASTAFileName, temp); temp.close(); + m->openOutputFile(scrapFASTAFileName, temp); temp.close(); + if(qFileName != ""){ + m->openOutputFile(trimQualFileName, temp); temp.close(); + m->openOutputFile(scrapQualFileName, temp); temp.close(); + } + if (nameFile != "") { + m->openOutputFile(trimNameFileName, temp); temp.close(); + m->openOutputFile(scrapNameFileName, temp); temp.close(); + } + + driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]); + processIDS.push_back(processors-1); + + + //Wait until all threads have terminated. + WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + for (map::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) { + map::iterator it2 = groupCounts.find(it->first); + if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; } + else { groupCounts[it->first] += it->second; } + } + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + +#endif + + + //append files for(int i=0;imothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine(); @@ -906,6 +1072,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName } } + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(createGroup){ ifstream in; string tempFile = filename + toString(processIDS[i]) + ".num.temp"; @@ -918,7 +1085,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName if (tempNum != 0) { while (!in.eof()) { in >> group >> tempNum; m->gobble(in); - + map::iterator it = groupCounts.find(group); if (it == groupCounts.end()) { groupCounts[group] = tempNum; } else { groupCounts[it->first] += tempNum; } @@ -926,11 +1093,10 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName } in.close(); m->mothurRemove(tempFile); } - + #endif } - - return exitCommand; -#endif + + return exitCommand; } catch(exception& e) { m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim"); @@ -940,14 +1106,16 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName /**************************************************************************************************/ -int TrimSeqsCommand::setLines(string filename, string qfilename, vector& fastaFilePos, vector& qfileFilePos) { +int TrimSeqsCommand::setLines(string filename, string qfilename) { try { - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + + vector fastaFilePos; + vector qfileFilePos; + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) //set file positions for fasta file fastaFilePos = m->divideFile(filename, processors); - if (qfilename == "") { return processors; } - //get name of first sequence in each chunk map firstSeqNames; for (int i = 0; i < (fastaFilePos.size()-1); i++) { @@ -960,92 +1128,101 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vectoropenInputFile(qfilename, inQual); - - string input; - while(!inQual.eof()){ - input = m->getline(inQual); - - if (input.length() != 0) { - if(input[0] == '>'){ //this is a sequence name line - istringstream nameStream(input); - - string sname = ""; nameStream >> sname; - sname = sname.substr(1); - - map::iterator it = firstSeqNames.find(sname); - - if(it != firstSeqNames.end()) { //this is the start of a new chunk - unsigned long long pos = inQual.tellg(); - qfileFilePos.push_back(pos - input.length() - 1); - firstSeqNames.erase(it); - } - } - } - - if (firstSeqNames.size() == 0) { break; } - } - inQual.close(); - - if (firstSeqNames.size() != 0) { - for (map::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) { - m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine(); - } - qFileName = ""; - return processors; - } - - //get last file position of qfile - FILE * pFile; - unsigned long long size; - - //get num bytes in file - pFile = fopen (qfilename.c_str(),"rb"); - if (pFile==NULL) perror ("Error opening file"); - else{ - fseek (pFile, 0, SEEK_END); - size=ftell (pFile); - fclose (pFile); - } - - qfileFilePos.push_back(size); + if(qfilename != "") { + //seach for filePos of each first name in the qfile and save in qfileFilePos + ifstream inQual; + m->openInputFile(qfilename, inQual); + + string input; + while(!inQual.eof()){ + input = m->getline(inQual); + + if (input.length() != 0) { + if(input[0] == '>'){ //this is a sequence name line + istringstream nameStream(input); + + string sname = ""; nameStream >> sname; + sname = sname.substr(1); + + map::iterator it = firstSeqNames.find(sname); + + if(it != firstSeqNames.end()) { //this is the start of a new chunk + unsigned long long pos = inQual.tellg(); + qfileFilePos.push_back(pos - input.length() - 1); + firstSeqNames.erase(it); + } + } + } + + if (firstSeqNames.size() == 0) { break; } + } + inQual.close(); + + + if (firstSeqNames.size() != 0) { + for (map::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) { + m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine(); + } + qFileName = ""; + return processors; + } + + //get last file position of qfile + FILE * pFile; + unsigned long long size; + + //get num bytes in file + pFile = fopen (qfilename.c_str(),"rb"); + if (pFile==NULL) perror ("Error opening file"); + else{ + fseek (pFile, 0, SEEK_END); + size=ftell (pFile); + fclose (pFile); + } + + qfileFilePos.push_back(size); + } + + for (int i = 0; i < (fastaFilePos.size()-1); i++) { + lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)])); + if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); } + } + if(qfilename == "") { qLines = lines; } //files with duds return processors; #else - - fastaFilePos.push_back(0); qfileFilePos.push_back(0); - //get last file position of fastafile - FILE * pFile; - unsigned long long size; - - //get num bytes in file - pFile = fopen (filename.c_str(),"rb"); - if (pFile==NULL) perror ("Error opening file"); - else{ - fseek (pFile, 0, SEEK_END); - size=ftell (pFile); - fclose (pFile); - } - fastaFilePos.push_back(size); - - //get last file position of fastafile - FILE * qFile; - - //get num bytes in file - qFile = fopen (qfilename.c_str(),"rb"); - if (qFile==NULL) perror ("Error opening file"); - else{ - fseek (qFile, 0, SEEK_END); - size=ftell (qFile); - fclose (qFile); - } - qfileFilePos.push_back(size); - + + if (processors == 1) { //save time + //fastaFilePos.push_back(0); qfileFilePos.push_back(0); + //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000); + lines.push_back(linePair(0, 1000)); + if (qfilename != "") { qLines.push_back(linePair(0, 1000)); } + }else{ + int numFastaSeqs = 0; + fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs); + if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); } + + if (qfilename != "") { + int numQualSeqs = 0; + qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs); + + if (numFastaSeqs != numQualSeqs) { + m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true; + } + } + + //figure out how many sequences you have to process + int numSeqsPerProcessor = numFastaSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } + lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor)); + if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); } + } + } + if(qfilename == "") { qLines = lines; } //files with duds return 1; #endif @@ -1109,19 +1286,46 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< primerNameVector.push_back(group); } else if(type == "REVERSE"){ - Sequence oligoRC("reverse", oligo); - oligoRC.reverseComplement(); - revPrimer.push_back(oligoRC.getUnaligned()); + //Sequence oligoRC("reverse", oligo); + //oligoRC.reverseComplement(); + string oligoRC = reverseOligo(oligo); + revPrimer.push_back(oligoRC); } else if(type == "BARCODE"){ inOligos >> group; + + //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs + //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info + string temp = ""; + while (!inOligos.eof()) { + char c = inOligos.get(); + if (c == 10 || c == 13){ break; } + else if (c == 32 || c == 9){;} //space or tab + else { temp += c; } + } + //then this is illumina data with 4 columns + if (temp != "") { + string reverseBarcode = reverseOligo(group); //reverse barcode + group = temp; + + //check for repeat barcodes + map::iterator itBar = rbarcodes.find(reverseBarcode); + if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine(); } + + rbarcodes[reverseBarcode]=indexBarcode; + } + //check for repeat barcodes map::iterator itBar = barcodes.find(oligo); if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } barcodes[oligo]=indexBarcode; indexBarcode++; barcodeNameVector.push_back(group); + }else if(type == "LINKER"){ + linker.push_back(oligo); + }else if(type == "SPACER"){ + spacer.push_back(oligo); } else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } } @@ -1213,6 +1417,8 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< } numFPrimers = primers.size(); numRPrimers = revPrimer.size(); + numLinkers = linker.size(); + numSpacers = spacer.size(); bool allBlank = true; for (int i = 0; i < barcodeNameVector.size(); i++) { @@ -1328,6 +1534,46 @@ bool TrimSeqsCommand::cullHomoP(Sequence& seq){ } } +//********************************************************************/ +string TrimSeqsCommand::reverseOligo(string oligo){ + try { + string reverse = ""; + + for(int i=oligo.length()-1;i>=0;i--){ + + if(oligo[i] == 'A') { reverse += 'T'; } + else if(oligo[i] == 'T'){ reverse += 'A'; } + else if(oligo[i] == 'U'){ reverse += 'A'; } + + else if(oligo[i] == 'G'){ reverse += 'C'; } + else if(oligo[i] == 'C'){ reverse += 'G'; } + + else if(oligo[i] == 'R'){ reverse += 'Y'; } + else if(oligo[i] == 'Y'){ reverse += 'R'; } + + else if(oligo[i] == 'M'){ reverse += 'K'; } + else if(oligo[i] == 'K'){ reverse += 'M'; } + + else if(oligo[i] == 'W'){ reverse += 'W'; } + else if(oligo[i] == 'S'){ reverse += 'S'; } + + else if(oligo[i] == 'B'){ reverse += 'V'; } + else if(oligo[i] == 'V'){ reverse += 'B'; } + + else if(oligo[i] == 'D'){ reverse += 'H'; } + else if(oligo[i] == 'H'){ reverse += 'D'; } + + else { reverse += 'N'; } + } + + + return reverse; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "reverseOligo"); + exit(1); + } +} //***************************************************************************************************************