X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=seqerrorcommand.cpp;h=5879241372dd3b60b34683683a0c1edbcbd64d57;hp=007ac93df67d1e93371abd1898c9cce9c0823233;hb=d1c97b8c04bb75faca1e76ffad60b37a4d789d3d;hpb=f5ef644ce76074de08b2bbb64097619b2b16d60d diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp index 007ac93..5879241 100644 --- a/seqerrorcommand.cpp +++ b/seqerrorcommand.cpp @@ -11,21 +11,27 @@ #include "reportfile.h" #include "qualityscores.h" #include "refchimeratest.h" +#include "myPerseus.h" #include "filterseqscommand.h" + //********************************************************************************************************************** + vector SeqErrorCommand::setParameters(){ try { - CommandParameter pquery("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pquery); - CommandParameter preference("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(preference); - CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "QualReport",false,false); parameters.push_back(pqfile); - CommandParameter preport("report", "InputTypes", "", "", "none", "none", "QualReport",false,false); parameters.push_back(preport); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pignorechimeras("ignorechimeras", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pignorechimeras); - CommandParameter pthreshold("threshold", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pthreshold); - CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter pquery("fasta", "InputTypes", "", "", "none", "none", "none","errorType",false,true,true); parameters.push_back(pquery); + CommandParameter preference("reference", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(preference); + CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "QualReport","",false,false); parameters.push_back(pqfile); + CommandParameter preport("report", "InputTypes", "", "", "none", "none", "QualReport","",false,false); parameters.push_back(preport); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","",false,false,true); parameters.push_back(pcount); + CommandParameter pignorechimeras("ignorechimeras", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pignorechimeras); + CommandParameter pthreshold("threshold", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pthreshold); + CommandParameter paligned("aligned", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(paligned); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter psave("save", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psave); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -36,11 +42,23 @@ vector SeqErrorCommand::setParameters(){ exit(1); } } + //********************************************************************************************************************** + string SeqErrorCommand::getHelpString(){ try { string helpString = ""; helpString += "The seq.error command reads a query alignment file and a reference alignment file and creates .....\n"; + helpString += "The fasta parameter...\n"; + helpString += "The reference parameter...\n"; + helpString += "The qfile parameter...\n"; + helpString += "The report parameter...\n"; + helpString += "The name parameter allows you to provide a name file associated with the fasta file.\n"; + helpString += "The count parameter allows you to provide a count file associated with the fasta file.\n"; + helpString += "The ignorechimeras parameter...\n"; + helpString += "The threshold parameter...\n"; + helpString += "The processors parameter...\n"; + helpString += "If the save parameter is set to true the reference sequences will be saved in memory, to clear them later you can use the clear.memory command. Default=f."; helpString += "Example seq.error(...).\n"; helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"; helpString += "For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n"; @@ -51,32 +69,66 @@ string SeqErrorCommand::getHelpString(){ exit(1); } } + +//********************************************************************************************************************** + +string SeqErrorCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "errorsummary") { pattern = "[filename],error.summary"; } + else if (type == "errorseq") { pattern = "[filename],error.seq"; } + else if (type == "errorquality") { pattern = "[filename],error.quality"; } + else if (type == "errorqualforward") { pattern = "[filename],error.qual.forward"; } + else if (type == "errorqualreverse") { pattern = "[filename],error.qual.reverse"; } + else if (type == "errorforward") { pattern = "[filename],error.seq.forward"; } + else if (type == "errorreverse") { pattern = "[filename],error.seq.reverse"; } + else if (type == "errorcount") { pattern = "[filename],error.count"; } + else if (type == "errormatrix") { pattern = "[filename],error.matrix"; } + else if (type == "errorchimera") { pattern = "[filename],error.chimera"; } + else if (type == "errorref-query") { pattern = "[filename],error.ref-query"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "SeqErrorCommand", "getOutputPattern"); + exit(1); + } +} + //********************************************************************************************************************** + SeqErrorCommand::SeqErrorCommand(){ try { abort = true; calledHelp = true; + setParameters(); vector tempOutNames; - outputTypes["error.summary"] = tempOutNames; - outputTypes["error.seq"] = tempOutNames; - outputTypes["error.quality"] = tempOutNames; - outputTypes["error.qual.forward"] = tempOutNames; - outputTypes["error.qual.reverse"] = tempOutNames; - outputTypes["error.forward"] = tempOutNames; - outputTypes["error.reverse"] = tempOutNames; - outputTypes["error.count"] = tempOutNames; - outputTypes["error.matrix"] = tempOutNames; + outputTypes["errorsummary"] = tempOutNames; + outputTypes["errorseq"] = tempOutNames; + outputTypes["errorquality"] = tempOutNames; + outputTypes["errorqualforward"] = tempOutNames; + outputTypes["errorqualreverse"] = tempOutNames; + outputTypes["errorforward"] = tempOutNames; + outputTypes["errorreverse"] = tempOutNames; + outputTypes["errorcount"] = tempOutNames; + outputTypes["errormatrix"] = tempOutNames; + outputTypes["errorchimera"] = tempOutNames; + outputTypes["errorref-query"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "SeqErrorCommand", "SeqErrorCommand"); exit(1); } } + //*************************************************************************************************************** SeqErrorCommand::SeqErrorCommand(string option) { try { abort = false; calledHelp = false; + rdb = ReferenceDB::getInstance(); //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -99,15 +151,17 @@ SeqErrorCommand::SeqErrorCommand(string option) { //initialize outputTypes vector tempOutNames; - outputTypes["error.summary"] = tempOutNames; - outputTypes["error.seq"] = tempOutNames; - outputTypes["error.quality"] = tempOutNames; - outputTypes["error.qual.forward"] = tempOutNames; - outputTypes["error.qual.reverse"] = tempOutNames; - outputTypes["error.forward"] = tempOutNames; - outputTypes["error.reverse"] = tempOutNames; - outputTypes["error.count"] = tempOutNames; - outputTypes["error.matrix"] = tempOutNames; + outputTypes["errorsummary"] = tempOutNames; + outputTypes["errorseq"] = tempOutNames; + outputTypes["errorquality"] = tempOutNames; + outputTypes["errorqualforward"] = tempOutNames; + outputTypes["errorqualreverse"] = tempOutNames; + outputTypes["errorforward"] = tempOutNames; + outputTypes["errorreverse"] = tempOutNames; + outputTypes["errorcount"] = tempOutNames; + outputTypes["errormatrix"] = tempOutNames; + outputTypes["errorchimera"] = tempOutNames; + outputTypes["errorref-query"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter @@ -138,6 +192,14 @@ SeqErrorCommand::SeqErrorCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a names file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } it = parameters.find("qfile"); //user has given a quality score file @@ -163,19 +225,24 @@ SeqErrorCommand::SeqErrorCommand(string option) { if (queryFileName != "") { m->mothurOut("Using " + queryFileName + " as input file for the fasta parameter."); m->mothurOutEndLine(); } else { m->mothurOut("You have no current fasta file and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; } } - else if (queryFileName == "not open") { abort = true; } + else if (queryFileName == "not open") { queryFileName = ""; abort = true; } else { m->setFastaFile(queryFileName); } referenceFileName = validParameter.validFile(parameters, "reference", true); if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; } else if (referenceFileName == "not open") { abort = true; } - //check for optional parameters namesFileName = validParameter.validFile(parameters, "name", true); if(namesFileName == "not found"){ namesFileName = ""; } else if (namesFileName == "not open") { namesFileName = ""; abort = true; } else { m->setNameFile(namesFileName); } + + //check for optional parameters + countfile = validParameter.validFile(parameters, "count", true); + if(countfile == "not found"){ countfile = ""; } + else if (countfile == "not open") { countfile = ""; abort = true; } + else { m->setCountTableFile(countfile); } qualFileName = validParameter.validFile(parameters, "qfile", true); if(qualFileName == "not found"){ qualFileName = ""; } @@ -186,32 +253,75 @@ SeqErrorCommand::SeqErrorCommand(string option) { if(reportFileName == "not found"){ reportFileName = ""; } else if (reportFileName == "not open") { reportFileName = ""; abort = true; } - if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){ - m->mothurOut("if you use either a qual file or a report file, you have to have both."); - m->mothurOutEndLine(); - abort = true; - } - outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; outputDir += m->hasPath(queryFileName); //if user entered a file with a path then preserve it } + if ((countfile != "") && (namesFileName != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + //check for optional parameter and set defaults // ...at some point should added some additional type checking... temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found") { temp = "1.00"; } - convert(temp, threshold); + m->mothurConvert(temp, threshold); + + + temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } + save = m->isTrue(temp); + rdb->save = save; + if (save) { //clear out old references + rdb->clearMemory(); + } + + //this has to go after save so that if the user sets save=t and provides no reference we abort + referenceFileName = validParameter.validFile(parameters, "reference", true); + if (referenceFileName == "not found") { + //check for saved reference sequences + if (rdb->referenceSeqs.size() != 0) { + referenceFileName = "saved"; + }else { + m->mothurOut("[ERROR]: You don't have any saved reference sequences and the reference parameter is a required."); + m->mothurOutEndLine(); + abort = true; + } + }else if (referenceFileName == "not open") { abort = true; } + else { if (save) { rdb->setSavedReference(referenceFileName); } } + temp = validParameter.validFile(parameters, "ignorechimeras", false); if (temp == "not found") { temp = "T"; } ignoreChimeras = m->isTrue(temp); + temp = validParameter.validFile(parameters, "aligned", false); if (temp == "not found"){ temp = "t"; } + aligned = m->isTrue(temp); + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); - convert(temp, processors); + m->mothurConvert(temp, processors); substitutionMatrix.resize(6); for(int i=0;i<6;i++){ substitutionMatrix[i].resize(6,0); } + + if ((namesFileName == "") && (queryFileName != "")){ + vector files; files.push_back(queryFileName); + parser.getNameFile(files); + } + + if(aligned == true){ + if((reportFileName != "" && qualFileName == "") || (reportFileName == "" && qualFileName != "")){ + m->mothurOut("if you use either a qual file or a report file, you have to have both."); + m->mothurOutEndLine(); + abort = true; + } + } + else{ + if(reportFileName != ""){ + m->mothurOut("we are ignoring the report file if your sequences are not aligned. we will check that the sequences in your fasta and and qual file are the same length."); + m->mothurOutEndLine(); + } + } + + } } catch(exception& e) { @@ -219,6 +329,7 @@ SeqErrorCommand::SeqErrorCommand(string option) { exit(1); } } + //*************************************************************************************************************** int SeqErrorCommand::execute(){ @@ -226,26 +337,34 @@ int SeqErrorCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } int start = time(NULL); - maxLength = 2000; + maxLength = 5000; totalBases = 0; totalMatches = 0; - string errorSummaryFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.summary"; - outputNames.push_back(errorSummaryFileName); outputTypes["error.summary"].push_back(errorSummaryFileName); + string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(queryFileName)); + map variables; + variables["[filename]"] = fileNameRoot; + string errorSummaryFileName = getOutputFileName("errorsummary",variables); + outputNames.push_back(errorSummaryFileName); outputTypes["errorsummary"].push_back(errorSummaryFileName); - string errorSeqFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq"; - outputNames.push_back(errorSeqFileName); outputTypes["error.seq"].push_back(errorSeqFileName); + string errorSeqFileName = getOutputFileName("errorseq",variables); + outputNames.push_back(errorSeqFileName); outputTypes["errorseq"].push_back(errorSeqFileName); - string errorChimeraFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.chimera"; - outputNames.push_back(errorChimeraFileName); outputTypes["error.chimera"].push_back(errorChimeraFileName); + string errorChimeraFileName = getOutputFileName("errorchimera",variables); + outputNames.push_back(errorChimeraFileName); outputTypes["errorchimera"].push_back(errorChimeraFileName); getReferences(); //read in reference sequences - make sure there's no ambiguous bases - if(namesFileName != ""){ weights = getWeights(); } + if(namesFileName != "") { weights = getWeights(); } + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile, false); + weights = ct.getNameMap(); + } - vector fastaFilePos; - vector qFilePos; - vector reportFilePos; + vector fastaFilePos; + vector qFilePos; + vector reportFilePos; setLines(queryFileName, qualFileName, reportFileName, fastaFilePos, qFilePos, reportFilePos); @@ -254,34 +373,35 @@ int SeqErrorCommand::execute(){ for (int i = 0; i < (fastaFilePos.size()-1); i++) { lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)])); if (qualFileName != "") { qLines.push_back(linePair(qFilePos[i], qFilePos[(i+1)])); } - if (reportFileName != "") { rLines.push_back(linePair(reportFilePos[i], reportFilePos[(i+1)])); } + if (reportFileName != "" && aligned == true) { rLines.push_back(linePair(reportFilePos[i], reportFilePos[(i+1)])); } } if(qualFileName == "") { qLines = lines; rLines = lines; } //fills with duds - + if(aligned == false){ rLines = lines; } int numSeqs = 0; -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ numSeqs = driver(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName, lines[0], qLines[0], rLines[0]); + }else{ numSeqs = createProcesses(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName); } #else numSeqs = driver(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName, lines[0], qLines[0], rLines[0]); #endif - - if(qualFileName != "" && reportFileName != ""){ + + if(qualFileName != ""){ printErrorQuality(qScoreErrorMap); printQualityFR(qualForwardMap, qualReverseMap); } printErrorFRFile(errorForward, errorReverse); - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - string errorCountFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.count"; + string errorCountFileName = getOutputFileName("errorcount",variables); ofstream errorCountFile; m->openOutputFile(errorCountFileName, errorCountFile); - outputNames.push_back(errorCountFileName); outputTypes["error.count"].push_back(errorCountFileName); + outputNames.push_back(errorCountFileName); outputTypes["errorcount"].push_back(errorCountFileName); m->mothurOut("Overall error rate:\t" + toString((double)(totalBases - totalMatches) / (double)totalBases) + "\n"); m->mothurOut("Errors\tSequences\n"); errorCountFile << "Errors\tSequences\n"; @@ -291,23 +411,32 @@ int SeqErrorCommand::execute(){ } errorCountFile.close(); - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } +// if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } printSubMatrix(); - - string megAlignmentFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.ref-query"; + + string megAlignmentFileName = getOutputFileName("errorref-query",variables); ofstream megAlignmentFile; m->openOutputFile(megAlignmentFileName, megAlignmentFile); - outputNames.push_back(megAlignmentFileName); outputTypes["error.ref-query"].push_back(megAlignmentFileName); - + outputNames.push_back(megAlignmentFileName); outputTypes["errorref-query"].push_back(megAlignmentFileName); + for(int i=0;imothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); + //set fasta file as new current fastafile + string current = ""; + itTypes = outputTypes.find("errorseq"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } + } + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } @@ -320,14 +449,16 @@ int SeqErrorCommand::execute(){ exit(1); } } + //********************************************************************************************************************** + int SeqErrorCommand::createProcesses(string filename, string qFileName, string rFileName, string summaryFileName, string errorOutputFileName, string chimeraOutputFileName) { try { int process = 1; processIDS.clear(); map >::iterator it; int num = 0; -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) //loop through and create all the processes you want while (process != processors) { @@ -448,11 +579,11 @@ int SeqErrorCommand::createProcesses(string filename, string qFileName, string r m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine(); m->appendFiles((summaryFileName + toString(processIDS[i]) + ".temp"), summaryFileName); - remove((summaryFileName + toString(processIDS[i]) + ".temp").c_str()); + m->mothurRemove((summaryFileName + toString(processIDS[i]) + ".temp")); m->appendFiles((errorOutputFileName + toString(processIDS[i]) + ".temp"), errorOutputFileName); - remove((errorOutputFileName + toString(processIDS[i]) + ".temp").c_str()); + m->mothurRemove((errorOutputFileName + toString(processIDS[i]) + ".temp")); m->appendFiles((chimeraOutputFileName + toString(processIDS[i]) + ".temp"), chimeraOutputFileName); - remove((chimeraOutputFileName + toString(processIDS[i]) + ".temp").c_str()); + m->mothurRemove((chimeraOutputFileName + toString(processIDS[i]) + ".temp")); ifstream in; string tempFile = filename + toString(processIDS[i]) + ".info.temp"; @@ -535,7 +666,7 @@ int SeqErrorCommand::createProcesses(string filename, string qFileName, string r int misMatchSize; in >> misMatchSize; m->gobble(in); if (misMatchSize > misMatchCounts.size()) { misMatchCounts.resize(misMatchSize, 0); } - for (int j = 0; j < misMatchCounts.size(); j++) { + for (int j = 0; j < misMatchSize; j++) { in >> tempNum; misMatchCounts[j] += tempNum; } m->gobble(in); @@ -547,7 +678,7 @@ int SeqErrorCommand::createProcesses(string filename, string qFileName, string r } m->gobble(in); - in.close(); remove(tempFile.c_str()); + in.close(); m->mothurRemove(tempFile); } #endif @@ -558,7 +689,9 @@ int SeqErrorCommand::createProcesses(string filename, string qFileName, string r exit(1); } } + //********************************************************************************************************************** + int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, string summaryFileName, string errorOutputFileName, string chimeraOutputFileName, linePair line, linePair qline, linePair rline) { try { @@ -570,10 +703,10 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, int numSeqs = 0; map::iterator it; - qScoreErrorMap['m'].assign(41, 0); - qScoreErrorMap['s'].assign(41, 0); - qScoreErrorMap['i'].assign(41, 0); - qScoreErrorMap['a'].assign(41, 0); + qScoreErrorMap['m'].assign(101, 0); + qScoreErrorMap['s'].assign(101, 0); + qScoreErrorMap['i'].assign(101, 0); + qScoreErrorMap['a'].assign(101, 0); errorForward['m'].assign(maxLength,0); errorForward['s'].assign(maxLength,0); @@ -590,11 +723,12 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, //open inputfiles and go to beginning place for this processor ifstream queryFile; m->openInputFile(filename, queryFile); + queryFile.seekg(line.start); ifstream reportFile; ifstream qualFile; - if(qFileName != "" && rFileName != ""){ + if((qFileName != "" && rFileName != "" && aligned)){ m->openInputFile(qFileName, qualFile); qualFile.seekg(qline.start); @@ -608,15 +742,32 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, qualForwardMap.resize(maxLength); qualReverseMap.resize(maxLength); for(int i=0;iopenInputFile(qFileName, qualFile); + qualFile.seekg(qline.start); + + qualForwardMap.resize(maxLength); + qualReverseMap.resize(maxLength); + for(int i=0;iopenOutputFile(chimeraOutputFileName, outChimeraReport); - RefChimeraTest chimeraTest(referenceSeqs); - if (line.start == 0) { chimeraTest.printHeader(outChimeraReport); } + + + RefChimeraTest chimeraTest; + + chimeraTest = RefChimeraTest(referenceSeqs, aligned); + if (line.start == 0) { chimeraTest.printHeader(outChimeraReport); } + ofstream errorSummaryFile; m->openOutputFile(summaryFileName, errorSummaryFile); @@ -625,47 +776,58 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, ofstream errorSeqFile; m->openOutputFile(errorOutputFileName, errorSeqFile); - megaAlignVector.resize(numRefs, ""); + megaAlignVector.assign(numRefs, ""); int index = 0; bool ignoreSeq = 0; bool moreSeqs = 1; while (moreSeqs) { - - if (m->control_pressed) { queryFile.close(); if(qFileName != "" && rFileName != ""){ reportFile.close(); qualFile.close(); } outChimeraReport.close(); errorSummaryFile.close();errorSeqFile.close(); return 0; } - + Sequence query(queryFile); - - int numParentSeqs = chimeraTest.analyzeQuery(query.getName(), query.getAligned(), outChimeraReport); - int closestRefIndex = chimeraTest.getClosestRefIndex(); - + Sequence reference; + int numParentSeqs = -1; + int closestRefIndex = -1; + + string querySeq = query.getAligned(); + if (!aligned) { querySeq = query.getUnaligned(); } + + numParentSeqs = chimeraTest.analyzeQuery(query.getName(), querySeq, outChimeraReport); + + closestRefIndex = chimeraTest.getClosestRefIndex(); + + reference = referenceSeqs[closestRefIndex]; + + reference.setAligned(chimeraTest.getClosestRefAlignment()); + query.setAligned(chimeraTest.getQueryAlignment()); + if(numParentSeqs > 1 && ignoreChimeras == 1) { ignoreSeq = 1; } - else { ignoreSeq = 0; } - - Compare minCompare = getErrors(query, referenceSeqs[closestRefIndex]); + else { ignoreSeq = 0; } + + Compare minCompare; + + getErrors(query, reference, minCompare); - if(namesFileName != ""){ + if((namesFileName != "") || (countfile != "")){ it = weights.find(query.getName()); minCompare.weight = it->second; } else{ minCompare.weight = 1; } + printErrorData(minCompare, numParentSeqs, errorSummaryFile, errorSeqFile); if(!ignoreSeq){ - for(int i=0;i maxMismatch){ @@ -700,22 +874,25 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, index++; - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = queryFile.tellg(); + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + unsigned long long pos = queryFile.tellg(); if ((pos == -1) || (pos >= line.end)) { break; } #else if (queryFile.eof()) { break; } #endif - if(index % 100 == 0){ m->mothurOut(toString(index) + '\n'); } + if(index % 100 == 0){ m->mothurOutJustToScreen(toString(index)+"\n"); } } queryFile.close(); - if(qFileName != "" && rFileName != ""){ reportFile.close(); qualFile.close(); } - errorSummaryFile.close(); + outChimeraReport.close(); + errorSummaryFile.close(); errorSeqFile.close(); - + + if(qFileName != "" && rFileName != "") { reportFile.close(); qualFile.close(); } + else if(qFileName != "" && aligned == false){ qualFile.close(); } + //report progress - if(index % 100 != 0){ m->mothurOut(toString(index) + '\n'); } + m->mothurOutJustToScreen(toString(index)+"\n"); return index; } @@ -724,45 +901,80 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, exit(1); } } + //*************************************************************************************************************** void SeqErrorCommand::getReferences(){ try { - - ifstream referenceFile; - m->openInputFile(referenceFileName, referenceFile); - int numAmbigSeqs = 0; int maxStartPos = 0; int minEndPos = 100000; - while(referenceFile){ - Sequence currentSeq(referenceFile); - int numAmbigs = currentSeq.getAmbigBases(); - if(numAmbigs > 0){ numAmbigSeqs++; } + if (referenceFileName == "saved") { + int start = time(NULL); + m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine(); -// int startPos = currentSeq.getStartPos(); -// if(startPos > maxStartPos) { maxStartPos = startPos; } -// -// int endPos = currentSeq.getEndPos(); -// if(endPos < minEndPos) { minEndPos = endPos; } - referenceSeqs.push_back(currentSeq); + for (int i = 0; i < rdb->referenceSeqs.size(); i++) { + int numAmbigs = rdb->referenceSeqs[i].getAmbigBases(); + if(numAmbigs > 0){ numAmbigSeqs++; } + + // int startPos = rdb->referenceSeqs[i].getStartPos(); + // if(startPos > maxStartPos) { maxStartPos = startPos; } + // + // int endPos = rdb->referenceSeqs[i].getEndPos(); + // if(endPos < minEndPos) { minEndPos = endPos; } + if (rdb->referenceSeqs[i].getNumBases() == 0) { + m->mothurOut("[WARNING]: " + rdb->referenceSeqs[i].getName() + " is blank, ignoring.");m->mothurOutEndLine(); + }else { + referenceSeqs.push_back(rdb->referenceSeqs[i]); + } - m->gobble(referenceFile); + } + referenceFileName = rdb->getSavedReference(); + + m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(referenceSeqs.size()) + " sequences.");m->mothurOutEndLine(); + + }else { + int start = time(NULL); + + ifstream referenceFile; + m->openInputFile(referenceFileName, referenceFile); + + while(referenceFile){ + Sequence currentSeq(referenceFile); + int numAmbigs = currentSeq.getAmbigBases(); + if(numAmbigs > 0){ numAmbigSeqs++; } + + // int startPos = currentSeq.getStartPos(); + // if(startPos > maxStartPos) { maxStartPos = startPos; } + // + // int endPos = currentSeq.getEndPos(); + // if(endPos < minEndPos) { minEndPos = endPos; } + if (currentSeq.getNumBases() == 0) { + m->mothurOut("[WARNING]: " + currentSeq.getName() + " is blank, ignoring.");m->mothurOutEndLine(); + }else { + referenceSeqs.push_back(currentSeq); + if (rdb->save) { rdb->referenceSeqs.push_back(currentSeq); } + } + + m->gobble(referenceFile); + } + referenceFile.close(); + + m->mothurOut("It took " + toString(time(NULL) - start) + " to read " + toString(referenceSeqs.size()) + " sequences.");m->mothurOutEndLine(); } - referenceFile.close(); + numRefs = referenceSeqs.size(); - for(int i=0;imothurOut("Warning: " + toString(numAmbigSeqs) + " reference sequences have ambiguous bases, these bases will be ignored\n"); - } + } } catch(exception& e) { @@ -773,7 +985,7 @@ void SeqErrorCommand::getReferences(){ //*************************************************************************************************************** -Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){ +int SeqErrorCommand::getErrors(Sequence query, Sequence reference, Compare& errors){ try { if(query.getAlignLength() != reference.getAlignLength()){ m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n"); @@ -784,10 +996,11 @@ Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){ string r = reference.getAligned(); int started = 0; - Compare errors; - + //Compare errors; + + errors.sequence = ""; for(int i=0;ierrorOut(e, "SeqErrorCommand", "getErrors"); @@ -886,10 +1099,12 @@ map SeqErrorCommand::getWeights(){ nameCountMap[seqName] = m->getNumNames(redundantSeqs); m->gobble(nameFile); } + + nameFile.close(); + return nameCountMap; } - //*************************************************************************************************************** void SeqErrorCommand::printErrorHeader(ofstream& errorSummaryFile){ @@ -978,10 +1193,13 @@ void SeqErrorCommand::printErrorData(Compare error, int numParentSeqs, ofstream& void SeqErrorCommand::printSubMatrix(){ try { - string subMatrixFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.matrix"; + string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(queryFileName)); + map variables; + variables["[filename]"] = fileNameRoot; + string subMatrixFileName = getOutputFileName("errormatrix",variables); ofstream subMatrixFile; m->openOutputFile(subMatrixFileName, subMatrixFile); - outputNames.push_back(subMatrixFileName); outputTypes["error.matrix"].push_back(subMatrixFileName); + outputNames.push_back(subMatrixFileName); outputTypes["errormatrix"].push_back(subMatrixFileName); vector bases(6); bases[0] = "A"; bases[1] = "T"; @@ -1020,14 +1238,18 @@ void SeqErrorCommand::printSubMatrix(){ exit(1); } } + //*************************************************************************************************************** void SeqErrorCommand::printErrorFRFile(map > errorForward, map > errorReverse){ try{ - string errorForwardFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq.forward"; + string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(queryFileName)); + map variables; + variables["[filename]"] = fileNameRoot; + string errorForwardFileName = getOutputFileName("errorforward",variables); ofstream errorForwardFile; m->openOutputFile(errorForwardFileName, errorForwardFile); - outputNames.push_back(errorForwardFileName); outputTypes["error.forward"].push_back(errorForwardFileName); + outputNames.push_back(errorForwardFileName); outputTypes["errorforward"].push_back(errorForwardFileName); errorForwardFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl; for(int i=0;i > errorForward, map } errorForwardFile.close(); - string errorReverseFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.seq.reverse"; + string errorReverseFileName = getOutputFileName("errorreverse",variables); ofstream errorReverseFile; m->openOutputFile(errorReverseFileName, errorReverseFile); - outputNames.push_back(errorReverseFileName); outputTypes["error.reverse"].push_back(errorReverseFileName); + outputNames.push_back(errorReverseFileName); outputTypes["errorreverse"].push_back(errorReverseFileName); errorReverseFile << "position\ttotalseqs\tmatch\tsubstitution\tinsertion\tdeletion\tambiguous" << endl; for(int i=0;i > errorForward, map void SeqErrorCommand::printErrorQuality(map > qScoreErrorMap){ try{ - - string errorQualityFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.quality"; + string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(queryFileName)); + map variables; + variables["[filename]"] = fileNameRoot; + string errorQualityFileName = getOutputFileName("errorquality",variables); ofstream errorQualityFile; m->openOutputFile(errorQualityFileName, errorQualityFile); - outputNames.push_back(errorQualityFileName); outputTypes["error.quality"].push_back(errorQualityFileName); + outputNames.push_back(errorQualityFileName); outputTypes["errorquality"].push_back(errorQualityFileName); errorQualityFile << "qscore\tmatches\tsubstitutions\tinsertions\tambiguous" << endl; - for(int i=0;i<41;i++){ + for(int i=0;i<101;i++){ errorQualityFile << i << '\t' << qScoreErrorMap['m'][i] << '\t' << qScoreErrorMap['s'][i] << '\t' << qScoreErrorMap['i'][i] << '\t'<< qScoreErrorMap['a'][i] << endl; } errorQualityFile.close(); } catch(exception& e) { - m->errorOut(e, "SeqErrorCommand", "printErrorFRFile"); + m->errorOut(e, "SeqErrorCommand", "printErrorQuality"); exit(1); } } - //*************************************************************************************************************** void SeqErrorCommand::printQualityFR(vector > qualForwardMap, vector > qualReverseMap){ @@ -1104,11 +1327,13 @@ void SeqErrorCommand::printQualityFR(vector > qualForwardMap, vector } } } - - string qualityForwardFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.qual.forward"; + string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(queryFileName)); + map variables; + variables["[filename]"] = fileNameRoot; + string qualityForwardFileName = getOutputFileName("errorqualforward",variables); ofstream qualityForwardFile; m->openOutputFile(qualityForwardFileName, qualityForwardFile); - outputNames.push_back(qualityForwardFileName); outputTypes["error.qual.forward"].push_back(qualityForwardFileName); + outputNames.push_back(qualityForwardFileName); outputTypes["errorqualforward"].push_back(qualityForwardFileName); for(int i=0;i > qualForwardMap, vector qualityForwardFile.close(); - string qualityReverseFileName = queryFileName.substr(0,queryFileName.find_last_of('.')) + ".error.qual.reverse"; + string qualityReverseFileName = getOutputFileName("errorqualreverse",variables); ofstream qualityReverseFile; m->openOutputFile(qualityReverseFileName, qualityReverseFile); - outputNames.push_back(qualityReverseFileName); outputTypes["error.qual.reverse"].push_back(qualityReverseFileName); + outputNames.push_back(qualityReverseFileName); outputTypes["errorqualreverse"].push_back(qualityReverseFileName); for(int i=0;i > qualForwardMap, vector qualityReverseFile.close(); } catch(exception& e) { - m->errorOut(e, "SeqErrorCommand", "printErrorFRFile"); + m->errorOut(e, "SeqErrorCommand", "printQualityFR"); exit(1); } } + /**************************************************************************************************/ -int SeqErrorCommand::setLines(string filename, string qfilename, string rfilename, vector& fastaFilePos, vector& qfileFilePos, vector& rfileFilePos) { +int SeqErrorCommand::setLines(string filename, string qfilename, string rfilename, vector& fastaFilePos, vector& qfileFilePos, vector& rfileFilePos) { try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) //set file positions for fasta file fastaFilePos = m->divideFile(filename, processors); @@ -1185,11 +1411,13 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam string sname = ""; nameStream >> sname; sname = sname.substr(1); + + m->checkName(sname); map::iterator it = firstSeqNames.find(sname); if(it != firstSeqNames.end()) { //this is the start of a new chunk - unsigned long int pos = inQual.tellg(); + unsigned long long pos = inQual.tellg(); qfileFilePos.push_back(pos - input.length() - 1); firstSeqNames.erase(it); } @@ -1210,7 +1438,7 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam //get last file position of qfile FILE * pFile; - unsigned long int size; + unsigned long long size; //get num bytes in file pFile = fopen (qfilename.c_str(),"rb"); @@ -1223,65 +1451,67 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam qfileFilePos.push_back(size); - //seach for filePos of each first name in the rfile and save in rfileFilePos - string junk; - ifstream inR; - m->openInputFile(rfilename, inR); - - //read column headers - for (int i = 0; i < 16; i++) { - if (!inR.eof()) { inR >> junk; } - else { break; } - } - - while(!inR.eof()){ - - if (m->control_pressed) { inR.close(); return processors; } - - input = m->getline(inR); - - if (input.length() != 0) { - - istringstream nameStream(input); - string sname = ""; nameStream >> sname; - - map::iterator it = firstSeqNamesReport.find(sname); - - if(it != firstSeqNamesReport.end()) { //this is the start of a new chunk - unsigned long int pos = inR.tellg(); - rfileFilePos.push_back(pos - input.length() - 1); - firstSeqNamesReport.erase(it); - } - } - - if (firstSeqNamesReport.size() == 0) { break; } - m->gobble(inR); - } - inR.close(); - - if (firstSeqNamesReport.size() != 0) { - for (map::iterator it = firstSeqNamesReport.begin(); it != firstSeqNamesReport.end(); it++) { - m->mothurOut(it->first + " is in your fasta file and not in your report file, aborting."); m->mothurOutEndLine(); - } - m->control_pressed = true; - return processors; - } - - //get last file position of qfile - FILE * rFile; - unsigned long int sizeR; - - //get num bytes in file - rFile = fopen (rfilename.c_str(),"rb"); - if (rFile==NULL) perror ("Error opening file"); - else{ - fseek (rFile, 0, SEEK_END); - sizeR=ftell (rFile); - fclose (rFile); + if(aligned){ + //seach for filePos of each first name in the rfile and save in rfileFilePos + string junk; + ifstream inR; + + m->openInputFile(rfilename, inR); + + //read column headers + for (int i = 0; i < 16; i++) { + if (!inR.eof()) { inR >> junk; } + else { break; } + } + + while(!inR.eof()){ + + input = m->getline(inR); + + if (input.length() != 0) { + + istringstream nameStream(input); + string sname = ""; nameStream >> sname; + + m->checkName(sname); + + map::iterator it = firstSeqNamesReport.find(sname); + + if(it != firstSeqNamesReport.end()) { //this is the start of a new chunk + unsigned long long pos = inR.tellg(); + rfileFilePos.push_back(pos - input.length() - 1); + firstSeqNamesReport.erase(it); + } + } + + if (firstSeqNamesReport.size() == 0) { break; } + m->gobble(inR); + } + inR.close(); + + if (firstSeqNamesReport.size() != 0) { + for (map::iterator it = firstSeqNamesReport.begin(); it != firstSeqNamesReport.end(); it++) { + m->mothurOut(it->first + " is in your fasta file and not in your report file, aborting."); m->mothurOutEndLine(); + } + m->control_pressed = true; + return processors; + } + + //get last file position of qfile + FILE * rFile; + unsigned long long sizeR; + + //get num bytes in file + rFile = fopen (rfilename.c_str(),"rb"); + if (rFile==NULL) perror ("Error opening file"); + else{ + fseek (rFile, 0, SEEK_END); + sizeR=ftell (rFile); + fclose (rFile); + } + + rfileFilePos.push_back(sizeR); } - - rfileFilePos.push_back(sizeR); - return processors; #else @@ -1289,7 +1519,7 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam fastaFilePos.push_back(0); qfileFilePos.push_back(0); //get last file position of fastafile FILE * pFile; - unsigned long int size; + unsigned long long size; //get num bytes in file pFile = fopen (filename.c_str(),"rb"); @@ -1323,4 +1553,5 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam exit(1); } } + //***************************************************************************************************************