X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimflowscommand.cpp;h=9f603c4bacce57d2785333d86fb2a8098ec6b28a;hb=5b72d1cf3fa48730e5bb70d59cced1e43e1fe424;hp=6d697359bb2c26c4957faa9d981fb7399ac54313;hpb=ae57e166b2ed7b475ec3f466106bd76fabadd063;p=mothur.git diff --git a/trimflowscommand.cpp b/trimflowscommand.cpp index 6d69735..9f603c4 100644 --- a/trimflowscommand.cpp +++ b/trimflowscommand.cpp @@ -17,16 +17,18 @@ vector TrimFlowsCommand::setParameters(){ CommandParameter pflow("flow", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pflow); CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos); CommandParameter pmaxhomop("maxhomop", "Number", "", "9", "", "", "",false,false); parameters.push_back(pmaxhomop); - CommandParameter pmaxflows("maxflows", "Number", "", "720", "", "", "",false,false); parameters.push_back(pmaxflows); - CommandParameter pminflows("minflows", "Number", "", "360", "", "", "",false,false); parameters.push_back(pminflows); + CommandParameter pmaxflows("maxflows", "Number", "", "450", "", "", "",false,false); parameters.push_back(pmaxflows); + CommandParameter pminflows("minflows", "Number", "", "450", "", "", "",false,false); parameters.push_back(pminflows); CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs); CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs); - CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs); + CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs); + CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs); + CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter psignal("signal", "Number", "", "0.50", "", "", "",false,false); parameters.push_back(psignal); CommandParameter pnoise("noise", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pnoise); CommandParameter pallfiles("allfiles", "Boolean", "", "t", "", "", "",false,false); parameters.push_back(pallfiles); - CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder); + CommandParameter porder("order", "String", "", "TACG", "", "", "",false,false); parameters.push_back(porder); CommandParameter pfasta("fasta", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pfasta); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -54,7 +56,28 @@ string TrimFlowsCommand::getHelpString(){ exit(1); } } - +//********************************************************************************************************************** +string TrimFlowsCommand::getOutputFileNameTag(string type, string inputName=""){ + try { + string outputFileName = ""; + map >::iterator it; + + //is this a type this command creates + it = outputTypes.find(type); + if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } + else { + if (type == "flow") { outputFileName = "flow"; } + else if (type == "fasta") { outputFileName = "flow.fasta"; } + else if (type == "file") { outputFileName = "flow.files"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } + } + return outputFileName; + } + catch(exception& e) { + m->errorOut(e, "TrimFlowsCommand", "getOutputFileNameTag"); + exit(1); + } +} //********************************************************************************************************************** TrimFlowsCommand::TrimFlowsCommand(){ @@ -64,6 +87,7 @@ TrimFlowsCommand::TrimFlowsCommand(){ vector tempOutNames; outputTypes["flow"] = tempOutNames; outputTypes["fasta"] = tempOutNames; + outputTypes["file"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "TrimFlowsCommand", "TrimFlowsCommand"); @@ -101,6 +125,7 @@ TrimFlowsCommand::TrimFlowsCommand(string option) { vector tempOutNames; outputTypes["flow"] = tempOutNames; outputTypes["fasta"] = tempOutNames; + outputTypes["file"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -149,10 +174,10 @@ TrimFlowsCommand::TrimFlowsCommand(string option) { string temp; temp = validParameter.validFile(parameters, "minflows", false); if (temp == "not found") { temp = "450"; } - convert(temp, minFlows); + m->mothurConvert(temp, minFlows); temp = validParameter.validFile(parameters, "maxflows", false); if (temp == "not found") { temp = "450"; } - convert(temp, maxFlows); + m->mothurConvert(temp, maxFlows); temp = validParameter.validFile(parameters, "oligos", true); @@ -164,28 +189,35 @@ TrimFlowsCommand::TrimFlowsCommand(string option) { else if(m->isTrue(temp)) { fasta = 1; } temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found"){ temp = "9"; } - convert(temp, maxHomoP); + m->mothurConvert(temp, maxHomoP); temp = validParameter.validFile(parameters, "signal", false); if (temp == "not found"){ temp = "0.50"; } - convert(temp, signal); + m->mothurConvert(temp, signal); temp = validParameter.validFile(parameters, "noise", false); if (temp == "not found"){ temp = "0.70"; } - convert(temp, noise); + m->mothurConvert(temp, noise); temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found"){ temp = "0"; } - convert(temp, bdiffs); + m->mothurConvert(temp, bdiffs); temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found"){ temp = "0"; } - convert(temp, pdiffs); + m->mothurConvert(temp, pdiffs); - temp = validParameter.validFile(parameters, "tdiffs", false); - if (temp == "not found"){ int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); } - convert(temp, tdiffs); - if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; } + temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, ldiffs); + + temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, sdiffs); + + temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); } + m->mothurConvert(temp, tdiffs); + + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; } + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); - convert(temp, processors); + m->mothurConvert(temp, processors); flowOrder = validParameter.validFile(parameters, "order", false); if (flowOrder == "not found"){ flowOrder = "TACG"; } @@ -198,11 +230,13 @@ TrimFlowsCommand::TrimFlowsCommand(string option) { numFPrimers = 0; numRPrimers = 0; + numLinkers = 0; + numSpacers = 0; } } catch(exception& e) { - m->errorOut(e, "TrimFlowsCommand", "TrimSeqsCommand"); + m->errorOut(e, "TrimFlowsCommand", "TrimFlowsCommand"); exit(1); } } @@ -214,19 +248,19 @@ int TrimFlowsCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } - string trimFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "trim.flow"; + string trimFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "trim." + getOutputFileNameTag("flow"); outputNames.push_back(trimFlowFileName); outputTypes["flow"].push_back(trimFlowFileName); - string scrapFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "scrap.flow"; + string scrapFlowFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "scrap." + getOutputFileNameTag("flow");; outputNames.push_back(scrapFlowFileName); outputTypes["flow"].push_back(scrapFlowFileName); - string fastaFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.fasta"; + string fastaFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("fasta"); if(fasta){ outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName); } vector flowFilePos; - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) flowFilePos = getFlowFileBreaks(); for (int i = 0; i < (flowFilePos.size()-1); i++) { lines.push_back(new linePair(flowFilePos[i], flowFilePos[(i+1)])); @@ -272,46 +306,48 @@ int TrimFlowsCommand::execute(){ ofstream output; if(allFiles){ - - flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files"; + set namesAlreadyProcessed; + flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("file"); m->openOutputFile(flowFilesFileName, output); for(int i=0;imothurRemove(barcodePrimerComboFileNames[i][j]); - } - else{ - output << barcodePrimerComboFileNames[i][j] << endl; - outputNames.push_back(barcodePrimerComboFileNames[i][j]); - outputTypes["flow"].push_back(barcodePrimerComboFileNames[i][j]); + if (namesAlreadyProcessed.count(barcodePrimerComboFileNames[i][j]) == 0) { + FILE * pFile; + unsigned long long size; + + //get num bytes in file + pFile = fopen (barcodePrimerComboFileNames[i][j].c_str(),"rb"); + if (pFile==NULL) perror ("Error opening file"); + else{ + fseek (pFile, 0, SEEK_END); + size=ftell(pFile); + fclose (pFile); + } + + if(size < 10){ + m->mothurRemove(barcodePrimerComboFileNames[i][j]); + } + else{ + output << m->getFullPathName(barcodePrimerComboFileNames[i][j]) << endl; + outputNames.push_back(barcodePrimerComboFileNames[i][j]); + outputTypes["flow"].push_back(barcodePrimerComboFileNames[i][j]); + } + namesAlreadyProcessed.insert(barcodePrimerComboFileNames[i][j]); } } } output.close(); } else{ - flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files"; + flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("file"); m->openOutputFile(flowFilesFileName, output); - output << trimFlowFileName << endl; + output << m->getFullPathName(trimFlowFileName) << endl; output.close(); } - outputTypes["flow.files"].push_back(flowFilesFileName); + outputTypes["file"].push_back(flowFilesFileName); outputNames.push_back(flowFilesFileName); // set fasta file as new current fastafile @@ -376,12 +412,10 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN int count = 0; bool moreSeqs = 1; - TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer); + TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer); while(moreSeqs) { - //cout << "driver " << count << endl; - - + if (m->control_pressed) { break; } int success = 1; @@ -389,11 +423,9 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN string trashCode = ""; flowData.getNext(flowFile); - //cout << "driver good bit " << flowFile.good() << endl; flowData.capFlows(maxFlows); Sequence currSeq = flowData.getSequence(); - if(!flowData.hasMinFlows(minFlows)){ //screen to see if sequence is of a minimum number of flows success = 0; trashCode += 'l'; @@ -402,12 +434,26 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN int primerIndex = 0; int barcodeIndex = 0; + if(numLinkers != 0){ + success = trimOligos.stripLinker(currSeq); + if(success > ldiffs) { trashCode += 'k'; } + else{ currentSeqDiffs += success; } + + } + if(barcodes.size() != 0){ success = trimOligos.stripBarcode(currSeq, barcodeIndex); if(success > bdiffs) { trashCode += 'b'; } else{ currentSeqDiffs += success; } } + if(numSpacers != 0){ + success = trimOligos.stripSpacer(currSeq); + if(success > sdiffs) { trashCode += 's'; } + else{ currentSeqDiffs += success; } + + } + if(numFPrimers != 0){ success = trimOligos.stripForward(currSeq, primerIndex); if(success > pdiffs) { trashCode += 'f'; } @@ -445,7 +491,7 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN //report progress if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) unsigned long long pos = flowFile.tellg(); if ((pos == -1) || (pos >= line->end)) { break; } @@ -519,9 +565,8 @@ void TrimFlowsCommand::getOligos(vector >& outFlowFileNames){ } else if(type == "REVERSE"){ - Sequence oligoRC("reverse", oligo); - oligoRC.reverseComplement(); - revPrimer.push_back(oligoRC.getUnaligned()); + string oligoRC = reverseOligo(oligo); + revPrimer.push_back(oligoRC); } else if(type == "BARCODE"){ oligosFile >> group; @@ -532,6 +577,10 @@ void TrimFlowsCommand::getOligos(vector >& outFlowFileNames){ barcodes[oligo]=indexBarcode; indexBarcode++; barcodeNameVector.push_back(group); + }else if(type == "LINKER"){ + linker.push_back(oligo); + }else if(type == "SPACER"){ + spacer.push_back(oligo); } else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); @@ -597,6 +646,8 @@ void TrimFlowsCommand::getOligos(vector >& outFlowFileNames){ numFPrimers = primers.size(); numRPrimers = revPrimer.size(); + numLinkers = linker.size(); + numSpacers = spacer.size(); } catch(exception& e) { @@ -604,6 +655,47 @@ void TrimFlowsCommand::getOligos(vector >& outFlowFileNames){ exit(1); } } +//********************************************************************/ +string TrimFlowsCommand::reverseOligo(string oligo){ + try { + string reverse = ""; + + for(int i=oligo.length()-1;i>=0;i--){ + + if(oligo[i] == 'A') { reverse += 'T'; } + else if(oligo[i] == 'T'){ reverse += 'A'; } + else if(oligo[i] == 'U'){ reverse += 'A'; } + + else if(oligo[i] == 'G'){ reverse += 'C'; } + else if(oligo[i] == 'C'){ reverse += 'G'; } + + else if(oligo[i] == 'R'){ reverse += 'Y'; } + else if(oligo[i] == 'Y'){ reverse += 'R'; } + + else if(oligo[i] == 'M'){ reverse += 'K'; } + else if(oligo[i] == 'K'){ reverse += 'M'; } + + else if(oligo[i] == 'W'){ reverse += 'W'; } + else if(oligo[i] == 'S'){ reverse += 'S'; } + + else if(oligo[i] == 'B'){ reverse += 'V'; } + else if(oligo[i] == 'V'){ reverse += 'B'; } + + else if(oligo[i] == 'D'){ reverse += 'H'; } + else if(oligo[i] == 'H'){ reverse += 'D'; } + + else { reverse += 'N'; } + } + + + return reverse; + } + catch(exception& e) { + m->errorOut(e, "TrimFlowsCommand", "reverseOligo"); + exit(1); + } +} + /**************************************************************************************************/ vector TrimFlowsCommand::getFlowFileBreaks() { @@ -686,7 +778,7 @@ int TrimFlowsCommand::createProcessesCreateTrim(string flowFileName, string trim processIDS.clear(); int exitCommand = 1; -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) int process = 1; //loop through and create all the processes you want