From 3e8b80da722e11c72bce957e2f42a6e884dd02b6 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Tue, 3 Dec 2013 14:51:46 -0500 Subject: [PATCH] added oligos, group, pdiffs, bdiffs, ldiffs, sdiffs, tdiffs to fastq.info so you can parse fastq files by sample. added group file to sffinfo so you can parse by group. added parsing to sra command. --- commandfactory.cpp | 7 +- makecontigscommand.cpp | 1 - parsefastaqcommand.cpp | 703 ++++++++++++++++++++++++++++++++++++++--- parsefastaqcommand.h | 36 ++- sffinfocommand.cpp | 350 +++++++++++++++----- sffinfocommand.h | 15 +- sracommand.cpp | 227 ++++++++++++- sracommand.h | 8 +- 8 files changed, 1188 insertions(+), 159 deletions(-) diff --git a/commandfactory.cpp b/commandfactory.cpp index c247f90..5a32ac1 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -148,6 +148,7 @@ #include "makelefsecommand.h" #include "lefsecommand.h" #include "kruskalwalliscommand.h" +#include "sracommand.h" /*******************************************************/ @@ -319,6 +320,7 @@ CommandFactory::CommandFactory(){ commands["make.lefse"] = "make.lefse"; commands["lefse"] = "lefse"; commands["kruskal.wallis"] = "kruskal.wallis"; + commands["sra"] = "sra"; } @@ -533,7 +535,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "make.contigs") { command = new MakeContigsCommand(optionString); } else if(commandName == "load.logfile") { command = new LoadLogfileCommand(optionString); } else if(commandName == "sff.multiple") { command = new SffMultipleCommand(optionString); } - else if(commandName == "classify.rf") { command = new ClassifyRFSharedCommand(optionString); } + else if(commandName == "classify.rf") { command = new ClassifyRFSharedCommand(optionString); } else if(commandName == "filter.shared") { command = new FilterSharedCommand(optionString); } else if(commandName == "primer.design") { command = new PrimerDesignCommand(optionString); } else if(commandName == "get.dists") { command = new GetDistsCommand(optionString); } @@ -546,6 +548,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "make.lefse") { command = new MakeLefseCommand(optionString); } else if(commandName == "lefse") { command = new LefseCommand(optionString); } else if(commandName == "kruskal.wallis") { command = new KruskalWallisCommand(optionString); } + else if(commandName == "sra") { command = new SRACommand(optionString); } else { command = new NoCommand(optionString); } return command; @@ -714,6 +717,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "make.lefse") { pipecommand = new MakeLefseCommand(optionString); } else if(commandName == "lefse") { pipecommand = new LefseCommand(optionString); } else if(commandName == "kruskal.wallis") { pipecommand = new KruskalWallisCommand(optionString); } + else if(commandName == "sra") { pipecommand = new SRACommand(optionString); } else { pipecommand = new NoCommand(optionString); } return pipecommand; @@ -868,6 +872,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "make.lefse") { shellcommand = new MakeLefseCommand(); } else if(commandName == "lefse") { shellcommand = new LefseCommand(); } else if(commandName == "kruskal.wallis") { shellcommand = new KruskalWallisCommand(); } + else if(commandName == "sra") { shellcommand = new SRACommand(); } else { shellcommand = new NoCommand(); } return shellcommand; diff --git a/makecontigscommand.cpp b/makecontigscommand.cpp index bf27d26..8796ab2 100644 --- a/makecontigscommand.cpp +++ b/makecontigscommand.cpp @@ -1977,7 +1977,6 @@ bool MakeContigsCommand::getOligos(vector >& fastaFileNames, stri else { uniquePrimers.insert(tempPair); } if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer pair " + newPrimer.forward + " " + newPrimer.reverse + ".\n"); } } - primers[indexPrimer]=newPrimer; indexPrimer++; primerNameVector.push_back(group); }else if(type == "BARCODE"){ diff --git a/parsefastaqcommand.cpp b/parsefastaqcommand.cpp index 051c1df..c3bd393 100644 --- a/parsefastaqcommand.cpp +++ b/parsefastaqcommand.cpp @@ -14,6 +14,13 @@ vector ParseFastaQCommand::setParameters(){ try { CommandParameter pfastq("fastq", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pfastq); + CommandParameter poligos("oligos", "InputTypes", "", "", "oligosGroup", "none", "none","",false,false); parameters.push_back(poligos); + CommandParameter pgroup("group", "InputTypes", "", "", "oligosGroup", "none", "none","",false,false); parameters.push_back(pgroup); + CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ppdiffs); + CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pbdiffs); + CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pldiffs); + CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(psdiffs); + CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs); CommandParameter pfasta("fasta", "Boolean", "", "T", "", "", "","fasta",false,false); parameters.push_back(pfasta); CommandParameter pqual("qfile", "Boolean", "", "T", "", "", "","qfile",false,false); parameters.push_back(pqual); CommandParameter ppacbio("pacbio", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ppacbio); @@ -35,8 +42,15 @@ string ParseFastaQCommand::getHelpString(){ try { string helpString = ""; helpString += "The fastq.info command reads a fastq file and creates a fasta and quality file.\n"; - helpString += "The fastq.info command parameters are fastq, fasta, qfile and format; fastq is required.\n"; + helpString += "The fastq.info command parameters are fastq, fasta, qfile, oligos, group and format; fastq is required.\n"; helpString += "The fastq.info command should be in the following format: fastq.info(fastaq=yourFastaQFile).\n"; + helpString += "The oligos parameter allows you to provide an oligos file to split your fastq file into separate fastq files by barcode and primers. \n"; + helpString += "The group parameter allows you to provide a group file to split your fastq file into separate fastq files by group. \n"; + helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the reads. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; + helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; + helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; + helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n"; + helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=sanger.\n"; helpString += "The fasta parameter allows you to indicate whether you want a fasta file generated. Default=T.\n"; helpString += "The qfile parameter allows you to indicate whether you want a quality file generated. Default=T.\n"; @@ -56,7 +70,8 @@ string ParseFastaQCommand::getOutputPattern(string type) { string pattern = ""; if (type == "fasta") { pattern = "[filename],fasta"; } - else if (type == "qfile") { pattern = "[filename],qual"; } + else if (type == "qfile") { pattern = "[filename],qual"; } + else if (type == "fastq") { pattern = "[filename],[group],fastq"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } return pattern; @@ -74,6 +89,7 @@ ParseFastaQCommand::ParseFastaQCommand(){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["qfile"] = tempOutNames; + outputTypes["fastq"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "ParseFastaQCommand", "ParseFastaQCommand"); @@ -83,7 +99,8 @@ ParseFastaQCommand::ParseFastaQCommand(){ //********************************************************************************************************************** ParseFastaQCommand::ParseFastaQCommand(string option){ try { - abort = false; calledHelp = false; + abort = false; calledHelp = false; + split = 1; if(option == "help") { help(); abort = true; calledHelp = true; } else if(option == "citation") { citation(); abort = true; calledHelp = true;} @@ -106,6 +123,7 @@ ParseFastaQCommand::ParseFastaQCommand(string option){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["qfile"] = tempOutNames; + outputTypes["fastq"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -119,12 +137,40 @@ ParseFastaQCommand::ParseFastaQCommand(string option){ //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["fastq"] = inputDir + it->second; } } + + it = parameters.find("oligos"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["oligos"] = inputDir + it->second; } + } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } } //check for required parameters fastaQFile = validParameter.validFile(parameters, "fastq", true); if (fastaQFile == "not found") { m->mothurOut("fastq is a required parameter for the fastq.info command."); m->mothurOutEndLine(); abort = true; } - else if (fastaQFile == "not open") { fastaQFile = ""; abort = true; } + else if (fastaQFile == "not open") { fastaQFile = ""; abort = true; } + + oligosfile = validParameter.validFile(parameters, "oligos", true); + if (oligosfile == "not found") { oligosfile = ""; } + else if (oligosfile == "not open") { oligosfile = ""; abort = true; } + else { m->setOligosFile(oligosfile); split = 2; } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not found") { groupfile = ""; } + else if (groupfile == "not open") { groupfile = ""; abort = true; } + else { m->setGroupFile(groupfile); split = 2; } + + if ((groupfile != "") && (oligosfile != "")) { m->mothurOut("You must enter ONLY ONE of the following: oligos or group."); m->mothurOutEndLine(); abort = true; } //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastaQFile); } @@ -139,6 +185,23 @@ ParseFastaQCommand::ParseFastaQCommand(string option){ temp = validParameter.validFile(parameters, "pacbio", false); if(temp == "not found"){ temp = "F"; } pacbio = m->isTrue(temp); + temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, bdiffs); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, ldiffs); + + temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, sdiffs); + + temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); } + m->mothurConvert(temp, tdiffs); + + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; } + format = validParameter.validFile(parameters, "format", false); if (format == "not found"){ format = "sanger"; } @@ -171,6 +234,18 @@ int ParseFastaQCommand::execute(){ if (fasta) { m->openOutputFile(fastaFile, outFasta); outputNames.push_back(fastaFile); outputTypes["fasta"].push_back(fastaFile); } if (qual) { m->openOutputFile(qualFile, outQual); outputNames.push_back(qualFile); outputTypes["qfile"].push_back(qualFile); } + + TrimOligos* trimOligos = NULL; + int numBarcodes, numPrimers; numBarcodes = 0; numPrimers = 0; + if (oligosfile != "") { + readOligos(oligosfile); + numPrimers = primers.size(); numBarcodes = barcodes.size(); + //find group read belongs to + if (pairedOligos) { trimOligos = new TrimOligos(pdiffs, bdiffs, 0, 0, pairedPrimers, pairedBarcodes); numBarcodes = pairedBarcodes.size(); numPrimers = pairedPrimers.size(); } + else { trimOligos = new TrimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer); } + + } + else if (groupfile != "") { readGroup(groupfile); } ifstream in; m->openInputFile(fastaQFile, in); @@ -181,65 +256,100 @@ int ParseFastaQCommand::execute(){ convertTable.push_back(temp); } + + int count = 0; while (!in.eof()) { if (m->control_pressed) { break; } - - //read sequence name - string name = m->getline(in); m->gobble(in); - if (name == "") { m->mothurOut("[ERROR]: Blank fasta name."); m->mothurOutEndLine(); m->control_pressed = true; break; } - else if (name[0] != '@') { m->mothurOut("[ERROR]: reading " + name + " expected a name with @ as a leading character."); m->mothurOutEndLine(); m->control_pressed = true; break; } - else { - name = name.substr(1); - m->checkName(name); - } - - //read sequence - string sequence = m->getline(in); m->gobble(in); - if (sequence == "") { m->mothurOut("[ERROR]: missing sequence for " + name); m->mothurOutEndLine(); m->control_pressed = true; break; } - - //read sequence name - string name2 = m->getline(in); m->gobble(in); - if (name2 == "") { m->mothurOut("[ERROR]: Blank quality name."); m->mothurOutEndLine(); m->control_pressed = true; break; } - else if (name2[0] != '+') { m->mothurOut("[ERROR]: reading " + name2 + " expected a name with + as a leading character."); m->mothurOutEndLine(); m->control_pressed = true; break; } - else { - name2 = name2.substr(1); - m->checkName(name2); - } - - //read quality scores - string quality = m->getline(in); m->gobble(in); - if (quality == "") { m->mothurOut("[ERROR]: missing quality for " + name2); m->mothurOutEndLine(); m->control_pressed = true; break; } - - //sanity check sequence length and number of quality scores match - if (name2 != "") { if (name != name2) { m->mothurOut("[ERROR]: names do not match. read " + name + " for fasta and " + name2 + " for quality."); m->mothurOutEndLine(); m->control_pressed = true; break; } } - if (quality.length() != sequence.length()) { m->mothurOut("[ERROR]: Lengths do not match for sequence " + name + ". Read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores."); m->mothurOutEndLine(); m->control_pressed = true; break; } - - vector qualScores; - if (qual) { - qualScores = convertQual(quality); - outQual << ">" << name << endl; - for (int i = 0; i < qualScores.size(); i++) { outQual << qualScores[i] << " "; } - outQual << endl; - } - if (m->control_pressed) { break; } + bool ignore; + fastqRead2 thisRead = readFastq(in, ignore); - if (pacbio) { - if (!qual) { qualScores = convertQual(quality); } //get scores if we didn't already - for (int i = 0; i < qualScores.size(); i++) { - if (qualScores[i] == 0){ sequence[i] = 'N'; } + if (!ignore) { + vector qualScores; + if (qual) { + qualScores = convertQual(thisRead.quality); + outQual << ">" << thisRead.seq.getName() << endl; + for (int i = 0; i < qualScores.size(); i++) { outQual << qualScores[i] << " "; } + outQual << endl; } - } - - //print sequence info to files - if (fasta) { outFasta << ">" << name << endl << sequence << endl; } - + + if (m->control_pressed) { break; } + + if (pacbio) { + if (!qual) { qualScores = convertQual(thisRead.quality); } //convert if not done + string sequence = thisRead.seq.getAligned(); + for (int i = 0; i < qualScores.size(); i++) { + if (qualScores[i] == 0){ sequence[i] = 'N'; } + } + thisRead.seq.setAligned(sequence); + } + + //print sequence info to files + if (fasta) { thisRead.seq.printSequence(outFasta); } + + if (split > 1) { + int barcodeIndex, primerIndex, trashCodeLength; + if (oligosfile != "") { trashCodeLength = findGroup(thisRead, barcodeIndex, primerIndex, trimOligos, numBarcodes, numPrimers); } + else if (groupfile != "") { trashCodeLength = findGroup(thisRead, barcodeIndex, primerIndex, "groupMode"); } + else { m->mothurOut("[ERROR]: uh oh, we shouldn't be here...\n"); } + + if(trashCodeLength == 0){ + ofstream out; + m->openOutputFileAppend(fastqFileNames[barcodeIndex][primerIndex], out); + out << thisRead.wholeRead; + out.close(); + }else{ + ofstream out; + m->openOutputFileAppend(noMatchFile, out); + out << thisRead.wholeRead; + out.close(); + } + } + //report progress + if((count+1) % 10000 == 0){ m->mothurOut(toString(count+1)); m->mothurOutEndLine(); } + if(count > 100000){ break; } + count++; + } } in.close(); if (fasta) { outFasta.close(); } if (qual) { outQual.close(); } + + //report progress + if (!m->control_pressed) { if((count) % 10000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } } + + if (split > 1) { + + if (groupfile != "") { delete groupMap; } + else if (oligosfile != "") { delete trimOligos; } + + map::iterator it; + set namesToRemove; + for(int i=0;iisBlank(fastqFileNames[i][j])){ + m->mothurRemove(fastqFileNames[i][j]); + namesToRemove.insert(fastqFileNames[i][j]); + } + } + } + } + } + + //remove names for outputFileNames, just cleans up the output + for(int i = 0; i < outputNames.size(); i++) { + if (namesToRemove.count(outputNames[i]) != 0) { + outputNames.erase(outputNames.begin()+i); + i--; + } + } + if(m->isBlank(noMatchFile)){ m->mothurRemove(noMatchFile); } + else { outputNames.push_back(noMatchFile); outputTypes["fastq"].push_back(noMatchFile); } + } if (m->control_pressed) { outputTypes.clear(); outputNames.clear(); m->mothurRemove(fastaFile); m->mothurRemove(qualFile); return 0; } @@ -267,6 +377,55 @@ int ParseFastaQCommand::execute(){ exit(1); } } +//********************************************************************************************************************** +fastqRead2 ParseFastaQCommand::readFastq(ifstream& in, bool& ignore){ + try { + ignore = false; + string wholeRead = ""; + + //read sequence name + string line = m->getline(in); m->gobble(in); if (split > 1) { wholeRead += line + "\n"; } + vector pieces = m->splitWhiteSpace(line); + string name = ""; if (pieces.size() != 0) { name = pieces[0]; } + if (name == "") { m->mothurOut("[WARNING]: Blank fasta name, ignoring read."); m->mothurOutEndLine(); ignore=true; } + else if (name[0] != '@') { m->mothurOut("[WARNING]: reading " + name + " expected a name with @ as a leading character, ignoring read."); m->mothurOutEndLine(); ignore=true; } + else { name = name.substr(1); } + + //read sequence + string sequence = m->getline(in); m->gobble(in); if (split > 1) { wholeRead += sequence + "\n"; } + if (sequence == "") { m->mothurOut("[WARNING]: missing sequence for " + name + ", ignoring."); ignore=true; } + + //read sequence name + line = m->getline(in); m->gobble(in); if (split > 1) { wholeRead += line + "\n"; } + pieces = m->splitWhiteSpace(line); + string name2 = ""; if (pieces.size() != 0) { name2 = pieces[0]; } + if (name2 == "") { m->mothurOut("[WARNING]: expected a name with + as a leading character, ignoring."); ignore=true; } + else if (name2[0] != '+') { m->mothurOut("[WARNING]: reading " + name2 + " expected a name with + as a leading character, ignoring."); ignore=true; } + else { name2 = name2.substr(1); if (name2 == "") { name2 = name; } } + + + //read quality scores + string quality = m->getline(in); m->gobble(in); if (split > 1) { wholeRead += quality + "\n"; } + if (quality == "") { m->mothurOut("[WARNING]: missing quality for " + name2 + ", ignoring."); ignore=true; } + + //sanity check sequence length and number of quality scores match + if (name2 != "") { if (name != name2) { m->mothurOut("[WARNING]: names do not match. read " + name + " for fasta and " + name2 + " for quality, ignoring."); ignore=true; } } + if (quality.length() != sequence.length()) { m->mothurOut("[WARNING]: Lengths do not match for sequence " + name + ". Read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores, ignoring read."); ignore=true; } + + m->checkName(name); + Sequence seq(name, sequence); + fastqRead2 read(seq, quality, wholeRead); + + if (m->debug) { m->mothurOut("[DEBUG]: " + read.seq.getName() + " " + read.seq.getAligned() + " " + quality + "\n"); } + + return read; + } + catch(exception& e) { + m->errorOut(e, "ParseFastaQCommand", "readFastq"); + exit(1); + } +} + //********************************************************************************************************************** vector ParseFastaQCommand::convertQual(string qual) { try { @@ -302,6 +461,446 @@ vector ParseFastaQCommand::convertQual(string qual) { } } //********************************************************************************************************************** +int ParseFastaQCommand::findGroup(fastqRead2 thisRead, int& barcode, int& primer, TrimOligos*& trimOligos, int numBarcodes, int numPrimers) { + try { + int success = 1; + string trashCode = ""; + int currentSeqsDiffs = 0; + + Sequence currSeq(thisRead.seq.getName(), thisRead.seq.getAligned()); + QualityScores currQual; currQual.setScores(convertQual(thisRead.quality)); + + if(linker.size() != 0){ + success = trimOligos->stripLinker(currSeq, currQual); + if(success > ldiffs) { trashCode += 'k'; } + else{ currentSeqsDiffs += success; } + + } + + if(numBarcodes != 0){ + success = trimOligos->stripBarcode(currSeq, currQual, barcode); + if(success > bdiffs) { trashCode += 'b'; } + else{ currentSeqsDiffs += success; } + } + + if(spacer.size() != 0){ + success = trimOligos->stripSpacer(currSeq, currQual); + if(success > sdiffs) { trashCode += 's'; } + else{ currentSeqsDiffs += success; } + + } + + if(numPrimers != 0){ + success = trimOligos->stripForward(currSeq, currQual, primer, true); + if(success > pdiffs) { trashCode += 'f'; } + else{ currentSeqsDiffs += success; } + } + + if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } + + if(revPrimer.size() != 0){ + success = trimOligos->stripReverse(currSeq, currQual); + if(!success) { trashCode += 'r'; } + } + + + return trashCode.length(); + } + catch(exception& e) { + m->errorOut(e, "ParseFastaQCommand", "findGroup"); + exit(1); + } +} +//********************************************************************************************************************** +int ParseFastaQCommand::findGroup(fastqRead2 thisRead, int& barcode, int& primer, string groupMode) { + try { + string trashCode = ""; + primer = 0; + + string group = groupMap->getGroup(thisRead.seq.getName()); + if (group == "not found") { trashCode += "g"; } //scrap for group + else { //find file group + map::iterator it = barcodes.find(group); + if (it != barcodes.end()) { + barcode = it->second; + }else { trashCode += "g"; } + } + + return trashCode.length(); + } + catch(exception& e) { + m->errorOut(e, "ParseFastaQCommand", "findGroup"); + exit(1); + } +} +//*************************************************************************************************************** + +bool ParseFastaQCommand::readOligos(string oligoFile){ + try { + ifstream inOligos; + m->openInputFile(oligoFile, inOligos); + + string type, oligo, roligo, group; + bool hasPrimer = false; bool hasPairedBarcodes = false; pairedOligos = false; + + int indexPrimer = 0; + int indexBarcode = 0; + int indexPairedPrimer = 0; + int indexPairedBarcode = 0; + set uniquePrimers; + set uniqueBarcodes; + + while(!inOligos.eof()){ + + inOligos >> type; + + if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); } + + if(type[0] == '#'){ + while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + m->gobble(inOligos); + } + else{ + m->gobble(inOligos); + //make type case insensitive + for(int i=0;i> oligo; + + if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); } + + for(int i=0;i::iterator itPrime = primers.find(oligo); + if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); } } + + primers[oligo]=indexPrimer; indexPrimer++; + primerNameVector.push_back(group); + } + else if (type == "PRIMER"){ + m->gobble(inOligos); + + inOligos >> roligo; + + for(int i=0;idebug) { m->mothurOut("[DEBUG]: primer pair " + newPrimer.forward + " " + newPrimer.reverse + ", and group = " + group + ".\n"); } + + //check for repeat barcodes + string tempPair = oligo+roligo; + if (uniquePrimers.count(tempPair) != 0) { m->mothurOut("primer pair " + newPrimer.forward + " " + newPrimer.reverse + " is in your oligos file already."); m->mothurOutEndLine(); } + else { uniquePrimers.insert(tempPair); } + + if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer pair " + newPrimer.forward + " " + newPrimer.reverse + ".\n"); } } + + pairedPrimers[indexPairedPrimer]=newPrimer; indexPairedPrimer++; + primerNameVector.push_back(group); + hasPrimer = true; + } + else if(type == "REVERSE"){ + //Sequence oligoRC("reverse", oligo); + //oligoRC.reverseComplement(); + string oligoRC = reverseOligo(oligo); + revPrimer.push_back(oligoRC); + } + else if(type == "BARCODE"){ + inOligos >> group; + + //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs + //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info + + string temp = ""; + while (!inOligos.eof()) { + char c = inOligos.get(); + if (c == 10 || c == 13 || c == -1){ break; } + else if (c == 32 || c == 9){;} //space or tab + else { temp += c; } + } + + //then this is illumina data with 4 columns + if (temp != "") { + hasPairedBarcodes = true; + string reverseBarcode = group; //reverseOligo(group); //reverse barcode + group = temp; + + for(int i=0;idebug) { m->mothurOut("[DEBUG]: barcode pair " + newPair.forward + " " + newPair.reverse + ", and group = " + group + ".\n"); } + //check for repeat barcodes + string tempPair = oligo+reverseBarcode; + if (uniqueBarcodes.count(tempPair) != 0) { m->mothurOut("barcode pair " + newPair.forward + " " + newPair.reverse + " is in your oligos file already, disregarding."); m->mothurOutEndLine(); } + else { uniqueBarcodes.insert(tempPair); } + + pairedBarcodes[indexPairedBarcode]=newPair; indexPairedBarcode++; + barcodeNameVector.push_back(group); + }else { + //check for repeat barcodes + map::iterator itBar = barcodes.find(oligo); + if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + barcodes[oligo]=indexBarcode; indexBarcode++; + barcodeNameVector.push_back(group); + } + }else if(type == "LINKER"){ + linker.push_back(oligo); + }else if(type == "SPACER"){ + spacer.push_back(oligo); + } + else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } + } + m->gobble(inOligos); + } + inOligos.close(); + + if (hasPairedBarcodes || hasPrimer) { + pairedOligos = true; + if ((primers.size() != 0) || (barcodes.size() != 0) || (linker.size() != 0) || (spacer.size() != 0) || (revPrimer.size() != 0)) { m->control_pressed = true; m->mothurOut("[ERROR]: cannot mix paired primers and barcodes with non paired or linkers and spacers, quitting."); m->mothurOutEndLine(); return 0; } + } + + //add in potential combos + if(barcodeNameVector.size() == 0){ + barcodes[""] = 0; + barcodeNameVector.push_back(""); + } + + if(primerNameVector.size() == 0){ + primers[""] = 0; + primerNameVector.push_back(""); + } + + fastqFileNames.resize(barcodeNameVector.size()); + for(int i=0;i uniqueNames; //used to cleanup outputFileNames + if (pairedOligos) { + for(map::iterator itBar = pairedBarcodes.begin();itBar != pairedBarcodes.end();itBar++){ + for(map::iterator itPrimer = pairedPrimers.begin();itPrimer != pairedPrimers.end(); itPrimer++){ + + string primerName = primerNameVector[itPrimer->first]; + string barcodeName = barcodeNameVector[itBar->first]; + + if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing + else { + string comboGroupName = ""; + string fastqFileName = ""; + + if(primerName == ""){ + comboGroupName = barcodeNameVector[itBar->first]; + } + else{ + if(barcodeName == ""){ + comboGroupName = primerNameVector[itPrimer->first]; + } + else{ + comboGroupName = barcodeNameVector[itBar->first] + "." + primerNameVector[itPrimer->first]; + } + } + + + ofstream temp; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaQFile)); + variables["[group]"] = comboGroupName; + fastqFileName = getOutputFileName("fastq", variables); + if (uniqueNames.count(fastqFileName) == 0) { + outputNames.push_back(fastqFileName); + outputTypes["fastq"].push_back(fastqFileName); + uniqueNames.insert(fastqFileName); + } + + fastqFileNames[itBar->first][itPrimer->first] = fastqFileName; + m->openOutputFile(fastqFileName, temp); temp.close(); + + } + } + } + }else { + for(map::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){ + for(map::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){ + + string primerName = primerNameVector[itPrimer->second]; + string barcodeName = barcodeNameVector[itBar->second]; + + if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing + else { + string comboGroupName = ""; + string fastqFileName = ""; + + if(primerName == ""){ + comboGroupName = barcodeNameVector[itBar->second]; + } + else{ + if(barcodeName == ""){ + comboGroupName = primerNameVector[itPrimer->second]; + } + else{ + comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second]; + } + } + + + ofstream temp; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaQFile)); + variables["[group]"] = comboGroupName; + fastqFileName = getOutputFileName("fastq", variables); + if (uniqueNames.count(fastqFileName) == 0) { + outputNames.push_back(fastqFileName); + outputTypes["fastq"].push_back(fastqFileName); + uniqueNames.insert(fastqFileName); + } + + fastqFileNames[itBar->second][itPrimer->second] = fastqFileName; + m->openOutputFile(fastqFileName, temp); temp.close(); + + } + } + } + } + + ofstream temp; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaQFile)); + variables["[group]"] = "scrap"; + noMatchFile = getOutputFileName("fastq", variables); + m->openOutputFile(noMatchFile, temp); temp.close(); + + return true; + + } + catch(exception& e) { + m->errorOut(e, "ParseFastaQCommand", "getOligos"); + exit(1); + } +} +//*************************************************************************************************************** +bool ParseFastaQCommand::readGroup(string groupfile){ + try { + fastqFileNames.clear(); + + groupMap = new GroupMap(); + groupMap->readMap(groupfile); + + //like barcodeNameVector - no primer names + vector groups = groupMap->getNamesOfGroups(); + + fastqFileNames.resize(groups.size()); + for (int i = 0; i < fastqFileNames.size(); i++) { + for (int j = 0; j < 1; j++) { + + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaQFile)); + variables["[group]"] = groups[i]; + string thisFilename = getOutputFileName("fastq",variables); + outputNames.push_back(thisFilename); + outputTypes["fastq"].push_back(thisFilename); + + ofstream temp; + m->openOutputFileBinary(thisFilename, temp); temp.close(); + fastqFileNames[i].push_back(thisFilename); + barcodes[groups[i]] = i; + } + } + + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaQFile)); + variables["[group]"] = "scrap"; + noMatchFile = getOutputFileName("fastq",variables); + m->mothurRemove(noMatchFile); + + return true; + + } + catch(exception& e) { + m->errorOut(e, "ParseFastaQCommand", "readGroup"); + exit(1); + } +} +//********************************************************************/ +string ParseFastaQCommand::reverseOligo(string oligo){ + try { + string reverse = ""; + + for(int i=oligo.length()-1;i>=0;i--){ + + if(oligo[i] == 'A') { reverse += 'T'; } + else if(oligo[i] == 'T'){ reverse += 'A'; } + else if(oligo[i] == 'U'){ reverse += 'A'; } + + else if(oligo[i] == 'G'){ reverse += 'C'; } + else if(oligo[i] == 'C'){ reverse += 'G'; } + + else if(oligo[i] == 'R'){ reverse += 'Y'; } + else if(oligo[i] == 'Y'){ reverse += 'R'; } + + else if(oligo[i] == 'M'){ reverse += 'K'; } + else if(oligo[i] == 'K'){ reverse += 'M'; } + + else if(oligo[i] == 'W'){ reverse += 'W'; } + else if(oligo[i] == 'S'){ reverse += 'S'; } + + else if(oligo[i] == 'B'){ reverse += 'V'; } + else if(oligo[i] == 'V'){ reverse += 'B'; } + + else if(oligo[i] == 'D'){ reverse += 'H'; } + else if(oligo[i] == 'H'){ reverse += 'D'; } + + else { reverse += 'N'; } + } + + + return reverse; + } + catch(exception& e) { + m->errorOut(e, "ParseFastaQCommand", "reverseOligo"); + exit(1); + } +} + + +//********************************************************************************************************************** diff --git a/parsefastaqcommand.h b/parsefastaqcommand.h index 5ae6b9c..2ac1416 100644 --- a/parsefastaqcommand.h +++ b/parsefastaqcommand.h @@ -12,6 +12,20 @@ #include "command.hpp" +#include "trimoligos.h" +#include "sequence.hpp" +#include "groupmap.h" + +struct fastqRead2 { + string quality; + Sequence seq; + string wholeRead; + + fastqRead2() { }; + fastqRead2(Sequence s, string q, string w) : seq(s), quality(q), wholeRead(w){}; + ~fastqRead2() {}; +}; + class ParseFastaQCommand : public Command { @@ -35,11 +49,29 @@ public: private: vector outputNames; - string outputDir, fastaQFile, format; - bool abort, fasta, qual, pacbio; + string outputDir, fastaQFile, format, oligosfile, groupfile; + bool abort, fasta, qual, pacbio, pairedOligos; + int pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, split; + GroupMap* groupMap; + + //oligos file data structures + vector linker, spacer, primerNameVector, barcodeNameVector, revPrimer; + map barcodes; + map primers; + map pairedBarcodes; + map pairedPrimers; + vector > fastqFileNames; + string noMatchFile; vector convertQual(string); vector convertTable; + bool readOligos(string oligosFile); + bool readGroup(string oligosFile); + string reverseOligo(string oligo); + fastqRead2 readFastq(ifstream&, bool&); + int findGroup(fastqRead2, int&, int&, TrimOligos*&, int, int); + int findGroup(fastqRead2, int&, int&, string); + }; #endif diff --git a/sffinfocommand.cpp b/sffinfocommand.cpp index bccb751..03bcebb 100755 --- a/sffinfocommand.cpp +++ b/sffinfocommand.cpp @@ -17,7 +17,8 @@ vector SffInfoCommand::setParameters(){ try { CommandParameter psff("sff", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(psff); - CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(poligos); + CommandParameter poligos("oligos", "InputTypes", "", "", "oligosGroup", "none", "none","",false,false); parameters.push_back(poligos); + CommandParameter pgroup("group", "InputTypes", "", "", "oligosGroup", "none", "none","",false,false); parameters.push_back(pgroup); CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(paccnos); CommandParameter psfftxt("sfftxt", "String", "", "", "", "", "","",false,false); parameters.push_back(psfftxt); CommandParameter pflow("flow", "Boolean", "", "T", "", "", "","flow",false,false); parameters.push_back(pflow); @@ -46,11 +47,12 @@ string SffInfoCommand::getHelpString(){ try { string helpString = ""; helpString += "The sffinfo command reads a sff file and extracts the sequence data, or you can use it to parse a sfftxt file.\n"; - helpString += "The sffinfo command parameters are sff, fasta, qfile, accnos, flow, sfftxt, oligos, bdiffs, tdiffs, ldiffs, sdiffs, pdiffs and trim. sff is required. \n"; + helpString += "The sffinfo command parameters are sff, fasta, qfile, accnos, flow, sfftxt, oligos, group, bdiffs, tdiffs, ldiffs, sdiffs, pdiffs and trim. sff is required. \n"; helpString += "The sff parameter allows you to enter the sff file you would like to extract data from. You may enter multiple files by separating them by -'s.\n"; helpString += "The fasta parameter allows you to indicate if you would like a fasta formatted file generated. Default=True. \n"; helpString += "The qfile parameter allows you to indicate if you would like a quality file generated. Default=True. \n"; helpString += "The oligos parameter allows you to provide an oligos file to split your sff file into separate sff files by barcode. \n"; + helpString += "The group parameter allows you to provide a group file to split your sff file into separate sff files by group. \n"; helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; @@ -112,7 +114,7 @@ SffInfoCommand::SffInfoCommand(){ SffInfoCommand::SffInfoCommand(string option) { try { abort = false; calledHelp = false; - hasAccnos = false; hasOligos = false; + hasAccnos = false; hasOligos = false; hasGroup = false; split = 1; //allow user to run help @@ -293,7 +295,7 @@ SffInfoCommand::SffInfoCommand(string option) { bool ignore = false; if (oligosFileNames[i] == "current") { oligosFileNames[i] = m->getOligosFile(); - if (oligosFileNames[i] != "") { m->mothurOut("Using " + oligosFileNames[i] + " as input file for the accnos parameter where you had given current."); m->mothurOutEndLine(); } + if (oligosFileNames[i] != "") { m->mothurOut("Using " + oligosFileNames[i] + " as input file for the oligos parameter where you had given current."); m->mothurOutEndLine(); } else { m->mothurOut("You have no current oligosfile, ignoring current."); m->mothurOutEndLine(); ignore=true; //erase from file list @@ -349,12 +351,87 @@ SffInfoCommand::SffInfoCommand(string option) { //make sure there is at least one valid file left if (oligosFileNames.size() == 0) { m->mothurOut("no valid oligos files."); m->mothurOutEndLine(); abort = true; } } + + groupfile = validParameter.validFile(parameters, "group", false); + if (groupfile == "not found") { groupfile = ""; } + else { + hasGroup = true; + m->splitAtDash(groupfile, groupFileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < groupFileNames.size(); i++) { + bool ignore = false; + if (groupFileNames[i] == "current") { + groupFileNames[i] = m->getGroupFile(); + if (groupFileNames[i] != "") { m->mothurOut("Using " + groupFileNames[i] + " as input file for the group parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current group file, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + groupFileNames.erase(groupFileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(groupFileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { groupFileNames[i] = inputDir + groupFileNames[i]; } + } + + ifstream in; + int ableToOpen = m->openInputFile(groupFileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(groupFileNames[i]); + m->mothurOut("Unable to open " + groupFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + groupFileNames[i] = tryPath; + } + } + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(groupFileNames[i]); + m->mothurOut("Unable to open " + groupFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + groupFileNames[i] = tryPath; + } + } + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + groupFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + groupFileNames.erase(groupFileNames.begin()+i); + i--; + } + } + } + + //make sure there is at least one valid file left + if (groupFileNames.size() == 0) { m->mothurOut("no valid group files."); m->mothurOutEndLine(); abort = true; } + } - if (hasOligos) { + if (hasGroup) { + split = 2; + if (groupFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each sff file."); m->mothurOutEndLine(); } + } + + if (hasOligos) { split = 2; - if (oligosFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide a oligos file, you must have one for each sff file."); m->mothurOutEndLine(); } + if (oligosFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide an oligos file, you must have one for each sff file."); m->mothurOutEndLine(); } } + if (hasGroup && hasOligos) { m->mothurOut("You must enter ONLY ONE of the following: oligos or group."); m->mothurOutEndLine(); abort = true;} + if (hasAccnos) { if (accnosFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide a accnos file, you must have one for each sff file."); m->mothurOutEndLine(); } } @@ -442,7 +519,8 @@ int SffInfoCommand::execute(){ string oligos = ""; if (hasOligos) { oligos = oligosFileNames[s]; } - + if (hasGroup) { oligos = groupFileNames[s]; } + int numReads = extractSffInfo(filenames[s], accnos, oligos); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to extract " + toString(numReads) + "."); @@ -490,9 +568,10 @@ int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){ if (accnos != "") { readAccnosFile(accnos); } else { seqNames.clear(); } - - if (oligos != "") { readOligos(oligos); split = 2; } - + + if (hasOligos) { readOligos(oligos); split = 2; } + if (hasGroup) { readGroup(oligos); split = 2; } + ofstream outSfftxt, outFasta, outQual, outFlow; string outFastaFileName, outQualFileName; string rootName = outputDir + m->getRootName(m->getSimpleName(input)); @@ -526,7 +605,7 @@ int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){ //print common header if (sfftxt) { printCommonHeader(outSfftxt, header); } if (flow) { outFlow << header.numFlowsPerRead << endl; } - + //read through the sff file while (!in.eof()) { @@ -551,7 +630,7 @@ int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){ } count++; - + //report progress if((count+1) % 10000 == 0){ m->mothurOut(toString(count+1)); m->mothurOutEndLine(); } @@ -574,13 +653,7 @@ int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){ //create new common headers for each file with the correct number of reads adjustCommonHeader(header); - //close files and delete ofstreams - for(int i=0;isecond)->close(); delete (filehandles[i][j].begin()->second); - (filehandlesHeaders[i][j].begin()->second)->close(); delete (filehandlesHeaders[i][j].begin()->second); - } - } + if (hasGroup) { delete groupMap; } //cout << "here" << endl; map::iterator it; @@ -588,13 +661,13 @@ int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){ for(int i=0;ifirst != "") { - if (namesToRemove.count(filehandles[i][j].begin()->first) == 0) { - if(m->isBlank(filehandles[i][j].begin()->first)){ + if (filehandles[i][j] != "") { + if (namesToRemove.count(filehandles[i][j]) == 0) { + if(m->isBlank(filehandles[i][j])){ //cout << i << '\t' << '\t' << j << '\t' << filehandles[i][j] << " is blank removing" << endl; - m->mothurRemove(filehandles[i][j].begin()->first); - m->mothurRemove(filehandlesHeaders[i][j].begin()->first); - namesToRemove.insert(filehandles[i][j].begin()->first); + m->mothurRemove(filehandles[i][j]); + m->mothurRemove(filehandlesHeaders[i][j]); + namesToRemove.insert(filehandles[i][j]); } } } @@ -604,11 +677,11 @@ int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){ //append new header to reads for (int i = 0; i < filehandles.size(); i++) { for (int j = 0; j < filehandles[i].size(); j++) { - m->appendBinaryFiles(filehandles[i][j].begin()->first, filehandlesHeaders[i][j].begin()->first); - m->renameFile(filehandlesHeaders[i][j].begin()->first, filehandles[i][j].begin()->first); - m->mothurRemove(filehandlesHeaders[i][j].begin()->first); + m->appendBinaryFiles(filehandles[i][j], filehandlesHeaders[i][j]); + m->renameFile(filehandlesHeaders[i][j], filehandles[i][j]); + m->mothurRemove(filehandlesHeaders[i][j]); //cout << i << '\t' << '\t' << j << '\t' << filehandles[i][j] << " done appending headers and removing " << filehandlesHeaders[i][j] << endl; - if (numSplitReads[i][j] == 0) { m->mothurRemove(filehandles[i][j].begin()->first); } + if (numSplitReads[i][j] == 0) { m->mothurRemove(filehandles[i][j]); } } } //cout << "here3" << endl; @@ -733,7 +806,10 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ in.read(mybuffer,4); for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + ofstream out; + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); } } outNoMatchHeader.write(mybuffer, in.gcount()); @@ -744,7 +820,10 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ in.read(mybuffer,4); for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + ofstream out; + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); } } outNoMatchHeader.write(mybuffer, in.gcount()); @@ -765,7 +844,10 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ thisbuffer[7] = offset & 0xFF; for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - (*(filehandlesHeaders[i][j].begin()->second)).write(thisbuffer, 8); + ofstream out; + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); + out.write(thisbuffer, 8); + out.close(); } } outNoMatchHeader.write(thisbuffer, 8); @@ -784,7 +866,10 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ thisbuffer2[3] = offset & 0xFF; for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - (*(filehandlesHeaders[i][j].begin()->second)).write(thisbuffer2, 4); + ofstream out; + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); + out.write(thisbuffer2, 4); + out.close(); } } outNoMatchHeader.write(thisbuffer2, 4); @@ -809,7 +894,10 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ thisbuffer[2] = (numSplitReads[i][j] >> 16) & 0xFF; thisbuffer[3] = (numSplitReads[i][j] >> 24) & 0xFF; } - (*(filehandlesHeaders[i][j].begin()->second)).write(thisbuffer, 4); + ofstream out; + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); + out.write(thisbuffer, 4); + out.close(); delete[] thisbuffer; } } @@ -834,7 +922,10 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ in.read(mybuffer,2); for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + ofstream out; + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); } } outNoMatchHeader.write(mybuffer, in.gcount()); @@ -845,7 +936,10 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ in.read(mybuffer,2); for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + ofstream out; + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); } } outNoMatchHeader.write(mybuffer, in.gcount()); @@ -856,7 +950,10 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ in.read(mybuffer,2); for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + ofstream out; + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); } } outNoMatchHeader.write(mybuffer, in.gcount()); @@ -867,7 +964,10 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ in.read(mybuffer,1); for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + ofstream out; + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); } } outNoMatchHeader.write(mybuffer, in.gcount()); @@ -878,7 +978,10 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ in.read(mybuffer,header.numFlowsPerRead); for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + ofstream out; + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); } } outNoMatchHeader.write(mybuffer, in.gcount()); @@ -889,7 +992,10 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ in.read(mybuffer,header.keyLength); for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + ofstream out; + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); } } outNoMatchHeader.write(mybuffer, in.gcount()); @@ -904,7 +1010,10 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ mybuffer = new char[spot-spotInFile]; for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, spot-spotInFile); + ofstream out; + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, spot-spotInFile); + out.close(); } } outNoMatchHeader.write(mybuffer, spot-spotInFile); @@ -1031,9 +1140,12 @@ bool SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, if (split > 1) { - int barcodeIndex, primerIndex; - int trashCodeLength = findGroup(header, read, barcodeIndex, primerIndex); - + int barcodeIndex, primerIndex, trashCodeLength; + + if (hasOligos) { trashCodeLength = findGroup(header, read, barcodeIndex, primerIndex); } + else if (hasGroup) { trashCodeLength = findGroup(header, read, barcodeIndex, primerIndex, "groupMode"); } + else { m->mothurOut("[ERROR]: uh oh, we shouldn't be here...\n"); } + char * mybuffer; mybuffer = new char [spot-startSpotInFile]; @@ -1044,7 +1156,10 @@ bool SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, if(trashCodeLength == 0){ - (*(filehandles[barcodeIndex][primerIndex].begin()->second)).write(mybuffer, in2.gcount()); + ofstream out; + m->openOutputFileBinaryAppend(filehandles[barcodeIndex][primerIndex], out); + out.write(mybuffer, in2.gcount()); + out.close(); numSplitReads[barcodeIndex][primerIndex]++; } else{ @@ -1152,7 +1267,29 @@ int SffInfoCommand::findGroup(Header header, seqRead read, int& barcode, int& pr m->errorOut(e, "SffInfoCommand", "findGroup"); exit(1); } -} +} +//********************************************************************************************************************** +int SffInfoCommand::findGroup(Header header, seqRead read, int& barcode, int& primer, string groupMode) { + try { + string trashCode = ""; + primer = 0; + + string group = groupMap->getGroup(header.name); + if (group == "not found") { trashCode += "g"; } //scrap for group + else { //find file group + map::iterator it = barcodes.find(group); + if (it != barcodes.end()) { + barcode = it->second; + }else { trashCode += "g"; } + } + + return trashCode.length(); + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "findGroup"); + exit(1); + } +} //********************************************************************************************************************** int SffInfoCommand::decodeName(string& timestamp, string& region, string& xy, string name) { try { @@ -1685,7 +1822,7 @@ bool SffInfoCommand::readOligos(string oligoFile){ while(!inOligos.eof()){ - inOligos >> type; + inOligos >> type; if(type[0] == '#'){ while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there @@ -1707,19 +1844,20 @@ bool SffInfoCommand::readOligos(string oligoFile){ group = ""; // get rest of line in case there is a primer name - while (!inOligos.eof()) { - char c = inOligos.get(); + while (!inOligos.eof()) { + char c = inOligos.get(); if (c == 10 || c == 13 || c == -1){ break; } else if (c == 32 || c == 9){;} //space or tab else { group += c; } - } + } //check for repeat barcodes map::iterator itPrime = primers.find(oligo); if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } - primers[oligo]=indexPrimer; indexPrimer++; + primers[oligo]=indexPrimer; indexPrimer++; primerNameVector.push_back(group); + }else if(type == "REVERSE"){ //Sequence oligoRC("reverse", oligo); //oligoRC.reverseComplement(); @@ -1728,6 +1866,7 @@ bool SffInfoCommand::readOligos(string oligoFile){ } else if(type == "BARCODE"){ inOligos >> group; + //check for repeat barcodes map::iterator itBar = barcodes.find(oligo); @@ -1743,31 +1882,27 @@ bool SffInfoCommand::readOligos(string oligoFile){ else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } } m->gobble(inOligos); - } + } inOligos.close(); - + if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ split = 1; } - + //add in potential combos if(barcodeNameVector.size() == 0){ barcodes[""] = 0; - barcodeNameVector.push_back(""); + barcodeNameVector.push_back(""); } if(primerNameVector.size() == 0){ primers[""] = 0; - primerNameVector.push_back(""); + primerNameVector.push_back(""); } filehandles.resize(barcodeNameVector.size()); - for (int i = 0; i < filehandles.size(); i++) { - for (int j = 0; j < primerNameVector.size(); j++) { - ofstream* temp; - map myMap; myMap[""] = temp; - filehandles[i].push_back(myMap); - } - } - + for(int i=0;i 1){ set uniqueNames; //used to cleanup outputFileNames for(map::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){ @@ -1793,8 +1928,8 @@ bool SffInfoCommand::readOligos(string oligoFile){ } } - ofstream* temp = new ofstream; - map variables; + ofstream temp; + map variables; variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(currentFileName)); variables["[group]"] = comboGroupName; string thisFilename = getOutputFileName("sff",variables); @@ -1804,24 +1939,22 @@ bool SffInfoCommand::readOligos(string oligoFile){ uniqueNames.insert(thisFilename); } - map myMap; myMap[thisFilename] = temp; - m->openOutputFileBinary(thisFilename, *(temp)); - filehandles[itBar->second][itPrimer->second] = myMap; - map::iterator itOfstream = filehandles[itBar->second][itPrimer->second].find(""); - if (itOfstream != filehandles[itBar->second][itPrimer->second].end()) { filehandles[itBar->second][itPrimer->second].erase(itOfstream); } //remove blank entry so we dont mess with .begin() above. code above assumes only 1 file name in the map + filehandles[itBar->second][itPrimer->second] = thisFilename; + temp.open(thisFilename.c_str(), ios::binary); temp.close(); } } } numFPrimers = primers.size(); numLinkers = linker.size(); numSpacers = spacer.size(); - map variables; + map variables; variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(currentFileName)); variables["[group]"] = "scrap"; noMatchFile = getOutputFileName("sff",variables); m->mothurRemove(noMatchFile); numNoMatch = 0; + bool allBlank = true; for (int i = 0; i < barcodeNameVector.size(); i++) { if (barcodeNameVector[i] != "") { @@ -1838,15 +1971,10 @@ bool SffInfoCommand::readOligos(string oligoFile){ filehandlesHeaders.resize(filehandles.size()); numSplitReads.resize(filehandles.size()); - for (int i = 0; i < filehandles.size(); i++) { - numSplitReads[i].resize(filehandles[i].size(), 0); + for (int i = 0; i < filehandles.size(); i++) { + numSplitReads[i].resize(filehandles[i].size(), 0); for (int j = 0; j < filehandles[i].size(); j++) { - ofstream* temp = new ofstream; - map myMap; - string thisHeader = (filehandles[i][j].begin())->first+"headers"; - myMap[thisHeader] = temp; - m->openOutputFileBinary(thisHeader, *(temp)); - filehandlesHeaders[i].push_back(myMap); + filehandlesHeaders[i].push_back(filehandles[i][j]+"headers"); } } @@ -1864,6 +1992,68 @@ bool SffInfoCommand::readOligos(string oligoFile){ exit(1); } } +//*************************************************************************************************************** + +bool SffInfoCommand::readGroup(string oligoFile){ + try { + filehandles.clear(); + numSplitReads.clear(); + filehandlesHeaders.clear(); + barcodes.clear(); + + groupMap = new GroupMap(); + groupMap->readMap(oligoFile); + + //like barcodeNameVector - no primer names + vector groups = groupMap->getNamesOfGroups(); + + filehandles.resize(groups.size()); + for (int i = 0; i < filehandles.size(); i++) { + for (int j = 0; j < 1; j++) { + + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(currentFileName)); + variables["[group]"] = groups[i]; + string thisFilename = getOutputFileName("sff",variables); + outputNames.push_back(thisFilename); + outputTypes["sff"].push_back(thisFilename); + + ofstream temp; + m->openOutputFileBinary(thisFilename, temp); temp.close(); + filehandles[i].push_back(thisFilename); + barcodes[groups[i]] = i; + } + } + + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(currentFileName)); + variables["[group]"] = "scrap"; + noMatchFile = getOutputFileName("sff",variables); + m->mothurRemove(noMatchFile); + numNoMatch = 0; + + + filehandlesHeaders.resize(groups.size()); + numSplitReads.resize(filehandles.size()); + for (int i = 0; i < filehandles.size(); i++) { + numSplitReads[i].resize(filehandles[i].size(), 0); + for (int j = 0; j < filehandles[i].size(); j++) { + ofstream temp ; + string thisHeader = filehandles[i][j]+"headers"; + m->openOutputFileBinary(thisHeader, temp); temp.close(); + filehandlesHeaders[i].push_back(thisHeader); + } + } + + return true; + + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "readGroup"); + exit(1); + } +} + //********************************************************************/ string SffInfoCommand::reverseOligo(string oligo){ try { diff --git a/sffinfocommand.h b/sffinfocommand.h index 541c8ba..1909e2d 100644 --- a/sffinfocommand.h +++ b/sffinfocommand.h @@ -11,7 +11,7 @@ */ #include "command.hpp" - +#include "groupmap.h" /**********************************************************/ struct CommonHeader { @@ -79,17 +79,18 @@ public: void help() { m->mothurOut(getHelpString()); } private: - string sffFilename, sfftxtFilename, outputDir, accnosName, currentFileName, oligosfile, noMatchFile; - vector filenames, outputNames, accnosFileNames, oligosFileNames; - bool abort, fasta, qual, trim, flow, sfftxt, hasAccnos, hasOligos; + string sffFilename, sfftxtFilename, outputDir, accnosName, currentFileName, oligosfile, noMatchFile, groupfile; + vector filenames, outputNames, accnosFileNames, oligosFileNames, groupFileNames; + bool abort, fasta, qual, trim, flow, sfftxt, hasAccnos, hasOligos, hasGroup; int mycount, split, numFPrimers, numLinkers, numSpacers, pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, numNoMatch; set seqNames; map barcodes; map primers; + GroupMap* groupMap; vector linker, spacer, primerNameVector, barcodeNameVector, revPrimer; vector > numSplitReads; - vector > > filehandles; - vector > > filehandlesHeaders; + vector > filehandles; + vector > filehandlesHeaders; //extract sff file functions int extractSffInfo(string, string, string); @@ -98,6 +99,7 @@ private: bool readSeqData(ifstream&, seqRead&, int, Header&); int decodeName(string&, string&, string&, string); bool readOligos(string oligosFile); + bool readGroup(string oligosFile); int printCommonHeader(ofstream&, CommonHeader&); int printHeader(ofstream&, Header&); @@ -110,6 +112,7 @@ private: bool sanityCheck(Header&, seqRead&); int adjustCommonHeader(CommonHeader); int findGroup(Header header, seqRead read, int& barcode, int& primer); + int findGroup(Header header, seqRead read, int& barcode, int& primer, string); string reverseOligo(string oligo); //parsesfftxt file functions diff --git a/sracommand.cpp b/sracommand.cpp index c291ddd..65ccda4 100644 --- a/sracommand.cpp +++ b/sracommand.cpp @@ -7,14 +7,25 @@ // #include "sracommand.h" +#include "sffinfocommand.h" +#include "parsefastaqcommand.h" //********************************************************************************************************************** vector SRACommand::setParameters(){ try { - CommandParameter psff("sff", "InputTypes", "", "", "none", "none", "none","sra",false,false); parameters.push_back(psff); - CommandParameter pfastqfile("fastqfile", "InputTypes", "", "", "none", "none", "none","sra",false,false); parameters.push_back(pfastqfile); + CommandParameter psff("sff", "InputTypes", "", "", "sffFastQFile", "sffFastQFile", "none","sra",false,false); parameters.push_back(psff); + CommandParameter pgroup("group", "InputTypes", "", "", "groupOligos", "none", "none","sra",false,false); parameters.push_back(pgroup); + CommandParameter poligos("oligos", "InputTypes", "", "", "groupOligos", "none", "none","sra",false,false); parameters.push_back(poligos); + CommandParameter pfile("file", "InputTypes", "", "", "sffFastQFile", "sffFastQFile", "none","sra",false,false); parameters.push_back(pfile); + CommandParameter pfastq("fastq", "InputTypes", "", "", "sffFastQFile", "sffFastQFile", "none","sra",false,false); parameters.push_back(pfastq); //choose only one multiple options CommandParameter pplatform("platform", "Multiple", "454-???-???", "454", "", "", "","",false,false); parameters.push_back(pplatform); + CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ppdiffs); + CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pbdiffs); + CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pldiffs); + CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(psdiffs); + CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs); + //every command must have inputdir and outputdir. This allows mothur users to redirect input and output files. CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); @@ -35,6 +46,12 @@ string SRACommand::getHelpString(){ helpString += "The sra command creates a sequence read archive from sff or fastq files.\n"; helpString += "The sra command parameters are: sff, fastqfiles, oligos, platform....\n"; helpString += "The sffiles parameter is used to provide a file containing a \n"; + helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; + helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; + helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; + helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n"; + helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; + helpString += "The new command should be in the following format: \n"; helpString += "new(...)\n"; return helpString; @@ -102,34 +119,95 @@ SRACommand::SRACommand(string option) { else { string path; - it = parameters.find("sfffiles"); + it = parameters.find("sff"); //user has given a template file if(it != parameters.end()){ path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { parameters["sfffiles"] = inputDir + it->second; } + if (path == "") { parameters["sff"] = inputDir + it->second; } } - it = parameters.find("fastqfiles"); + it = parameters.find("fastq"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fastq"] = inputDir + it->second; } + } + + it = parameters.find("file"); //user has given a template file if(it != parameters.end()){ path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { parameters["fastqfiles"] = inputDir + it->second; } + if (path == "") { parameters["file"] = inputDir + it->second; } + } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } + + it = parameters.find("oligos"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["oligos"] = inputDir + it->second; } } } //check for parameters - fastqfiles = validParameter.validFile(parameters, "fastqfiles", true); - if (fastqfiles == "not open") { fastqfiles = ""; abort = true; } - else if (fastqfiles == "not found") { fastqfiles = ""; } + fastqfile = validParameter.validFile(parameters, "fastq", true); + if (fastqfile == "not open") { fastqfile = ""; abort = true; } + else if (fastqfile == "not found") { fastqfile = ""; } - sfffiles = validParameter.validFile(parameters, "sfffiles", true); - if (sfffiles == "not open") { sfffiles = ""; abort = true; } - else if (sfffiles == "not found") { sfffiles = ""; } + sfffile = validParameter.validFile(parameters, "sff", true); + if (sfffile == "not open") { sfffile = ""; abort = true; } + else if (sfffile == "not found") { sfffile = ""; } + + file = validParameter.validFile(parameters, "file", true); + if (file == "not open") { file = ""; abort = true; } + else if (file == "not found") { file = ""; } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not open") { groupfile = ""; abort = true; } + else if (groupfile == "not found") { groupfile = ""; } + else { m->setGroupFile(groupfile); } + + oligosfile = validParameter.validFile(parameters, "oligos", true); + if (oligosfile == "not found") { oligosfile = ""; } + else if(oligosfile == "not open") { abort = true; } + else { m->setOligosFile(oligosfile); } + + + file = validParameter.validFile(parameters, "file", true); + if (file == "not open") { file = ""; abort = true; } + else if (file == "not found") { file = ""; } - if ((fastqfiles == "") && (sfffiles == "")) { - m->mothurOut("No valid current files. You must provide a sfffiles or fastqfiles file before you can use the sra command."); m->mothurOutEndLine(); abort = true; + if ((fastqfile == "") && (sfffile == "") && (sfffile == "")) { + m->mothurOut("[ERROR]: You must provide a file, sff file or fastq file before you can use the sra command."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (oligosfile != "")) { + m->mothurOut("[ERROR]: You may not use a group file and an oligos file, only one."); m->mothurOutEndLine(); abort = true; + } + + if ((fastqfile != "") || (sfffile != "")) { + if ((groupfile == "") && (oligosfile == "")) { + oligosfile = m->getOligosFile(); + if (oligosfile != "") { m->mothurOut("Using " + oligosfile + " as input file for the oligos parameter."); m->mothurOutEndLine(); } + else { + groupfile = m->getGroupFile(); + if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("[ERROR]: You must provide groupfile or oligos file if splitting a fastq or sff file."); m->mothurOutEndLine(); abort = true; + } + } + } } //use only one Mutliple type @@ -139,6 +217,25 @@ SRACommand::SRACommand(string option) { if ((platform == "454") || (platform == "????") || (platform == "????") || (platform == "????")) { } else { m->mothurOut("Not a valid platform option. Valid platform options are 454, ...."); m->mothurOutEndLine(); abort = true; } + + string temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found"){ temp = "0"; } + m->mothurConvert(temp, bdiffs); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found"){ temp = "0"; } + m->mothurConvert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, ldiffs); + + temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, sdiffs); + + temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); } + m->mothurConvert(temp, tdiffs); + + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; } + + } @@ -149,13 +246,20 @@ SRACommand::SRACommand(string option) { } } //********************************************************************************************************************** - int SRACommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } - + //parse files + vector filesBySample; + + if (file != "") { readFile(filesBySample); } + else if (sfffile != "") { parseSffFile(filesBySample); } + else if (fastqfile != "") { parseFastqFile(filesBySample); } + + + //output files created by command m->mothurOutEndLine(); @@ -171,5 +275,96 @@ int SRACommand::execute(){ } } //********************************************************************************************************************** +int SRACommand::readFile(vector& files){ + try { + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SRACommand", "readFile"); + exit(1); + } +} +//********************************************************************************************************************** +int SRACommand::parseSffFile(vector& files){ + try { + //run sffinfo to parse sff file into individual sampled sff files + string commandString = "sff=" + sfffile; + if (groupfile != "") { commandString += ", group=" + groupfile; } + else if (oligosfile != "") { + commandString += ", oligos=" + oligosfile; + //add in pdiffs, bdiffs, ldiffs, sdiffs, tdiffs + if (pdiffs != 0) { commandString += ", pdiffs=" + toString(pdiffs); } + if (bdiffs != 0) { commandString += ", bdiffs=" + toString(bdiffs); } + if (ldiffs != 0) { commandString += ", ldiffs=" + toString(ldiffs); } + if (sdiffs != 0) { commandString += ", sdiffs=" + toString(sdiffs); } + if (tdiffs != 0) { commandString += ", tdiffs=" + toString(tdiffs); } + } + m->mothurOutEndLine(); + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: sffinfo(" + commandString + ")"); m->mothurOutEndLine(); + m->mothurCalling = true; + + Command* sffinfoCommand = new SffInfoCommand(commandString); + sffinfoCommand->execute(); + + map > filenames = sffinfoCommand->getOutputFiles(); + map >::iterator it = filenames.find("sff"); + if (it != filenames.end()) { files = it->second; } + else { m->control_pressed = true; } // error in sffinfo + + delete sffinfoCommand; + m->mothurCalling = false; + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SRACommand", "readFile"); + exit(1); + } +} + +//********************************************************************************************************************** +int SRACommand::parseFastqFile(vector& files){ + try { + + //run sffinfo to parse sff file into individual sampled sff files + string commandString = "fastq=" + fastqfile; + if (groupfile != "") { commandString += ", group=" + groupfile; } + else if (oligosfile != "") { + commandString += ", oligos=" + oligosfile; + //add in pdiffs, bdiffs, ldiffs, sdiffs, tdiffs + if (pdiffs != 0) { commandString += ", pdiffs=" + toString(pdiffs); } + if (bdiffs != 0) { commandString += ", bdiffs=" + toString(bdiffs); } + if (ldiffs != 0) { commandString += ", ldiffs=" + toString(ldiffs); } + if (sdiffs != 0) { commandString += ", sdiffs=" + toString(sdiffs); } + if (tdiffs != 0) { commandString += ", tdiffs=" + toString(tdiffs); } + } + m->mothurOutEndLine(); + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: fastq.info(" + commandString + ")"); m->mothurOutEndLine(); + m->mothurCalling = true; + + Command* fastqinfoCommand = new ParseFastaQCommand(commandString); + fastqinfoCommand->execute(); + + map > filenames = fastqinfoCommand->getOutputFiles(); + map >::iterator it = filenames.find("fastq"); + if (it != filenames.end()) { files = it->second; } + else { m->control_pressed = true; } // error in sffinfo + + delete fastqinfoCommand; + m->mothurCalling = false; + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SRACommand", "readFile"); + exit(1); + } +} +//********************************************************************************************************************** diff --git a/sracommand.h b/sracommand.h index 0f0a945..7bf764b 100644 --- a/sracommand.h +++ b/sracommand.h @@ -35,8 +35,14 @@ public: private: bool abort; - string sfffiles, fastqfiles, platform, outputDir; + int tdiffs, bdiffs, pdiffs, sdiffs, ldiffs; + string sfffile, fastqfile, platform, outputDir, groupfile, file, oligosfile; vector outputNames; + + int readFile(vector&); + int parseSffFile(vector&); + int parseFastqFile(vector&); + }; /**************************************************************************************************/ -- 2.39.2