From eb71e28b7b7afd82540f4a8f0bac9429c5b9d713 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Wed, 6 Feb 2013 11:44:10 -0500 Subject: [PATCH] added *.count.summary file to count.groups command. added deltaq and insert parameters to make.contigs and changed contigs.report file. added illumina1.8+ format to fast commands. added contigsreport, align report and summary files to screen.seqs to allow files to be screened using their inputs. added minoverlap, ostart, oend, mismatches, maxn, minscore,maxinsert, minsim parameters to screen.seqs. added optimization types of minoverlap, ostart, oend, mismatches, maxn, minscore,maxinsert, minsim to screen.seqs. added getNumNs function to sequence class. working on Windows \\ issue. --- clustersplitcommand.cpp | 2 + countgroupscommand.cpp | 66 +- countgroupscommand.h | 3 +- makecontigscommand.cpp | 268 ++---- makecontigscommand.h | 119 +-- makefastqcommand.cpp | 8 +- makefile | 4 +- mothurout.cpp | 111 ++- mothurout.h | 3 +- parsefastaqcommand.cpp | 15 +- primerdesigncommand.cpp | 2 +- primerdesigncommand.h | 2 +- screenseqscommand.cpp | 1885 +++++++++++++++++++++++++++++++-------- screenseqscommand.h | 267 +++++- sequence.cpp | 9 + sequence.hpp | 1 + 16 files changed, 2046 insertions(+), 719 deletions(-) diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index 0e6798f..87d26ce 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -439,6 +439,8 @@ int ClusterSplitCommand::execute(){ vector< map > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size delete split; + if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); } + //output a merged distance file if (splitmethod == "fasta") { createMergedDistanceFile(distName); } diff --git a/countgroupscommand.cpp b/countgroupscommand.cpp index 215c2e5..6da0774 100644 --- a/countgroupscommand.cpp +++ b/countgroupscommand.cpp @@ -14,9 +14,9 @@ //********************************************************************************************************************** vector CountGroupsCommand::setParameters(){ try { - CommandParameter pshared("shared", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none","",false,false,true); parameters.push_back(pshared); - CommandParameter pgroup("group", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none","",false,false,true); parameters.push_back(pgroup); - CommandParameter pcount("count", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none","",false,false,true); parameters.push_back(pcount); + CommandParameter pshared("shared", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none","summary",false,false,true); parameters.push_back(pshared); + CommandParameter pgroup("group", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none","summary",false,false,true); parameters.push_back(pgroup); + CommandParameter pcount("count", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none","summary",false,false,true); parameters.push_back(pcount); CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(paccnos); CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); @@ -32,6 +32,21 @@ vector CountGroupsCommand::setParameters(){ } } //********************************************************************************************************************** +string CountGroupsCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "summary") { pattern = "[filename],count.summary"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "getOutputPattern"); + exit(1); + } +} +//********************************************************************************************************************** string CountGroupsCommand::getHelpString(){ try { string helpString = ""; @@ -55,6 +70,8 @@ CountGroupsCommand::CountGroupsCommand(){ try { abort = true; calledHelp = true; setParameters(); + vector tempOutNames; + outputTypes["summary"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "CountGroupsCommand", "CountGroupsCommand"); @@ -125,6 +142,8 @@ CountGroupsCommand::CountGroupsCommand(string option) { } } + vector tempOutNames; + outputTypes["summary"] = tempOutNames; //check for required parameters accnosfile = validParameter.validFile(parameters, "accnos", true); @@ -200,6 +219,15 @@ int CountGroupsCommand::execute(){ if (accnosfile != "") { m->readAccnos(accnosfile, Groups); m->setGroups(Groups); } if (groupfile != "") { + map variables; + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); } + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)); + string outputFileName = getOutputFileName("summary", variables); + outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName); + ofstream out; + m->openOutputFile(outputFileName, out); + GroupMap groupMap(groupfile); groupMap.readMap(); @@ -214,14 +242,24 @@ int CountGroupsCommand::execute(){ int num = groupMap.getNumSeqs(Groups[i]); total += num; m->mothurOut(Groups[i] + " contains " + toString(num) + "."); m->mothurOutEndLine(); + out << Groups[i] << '\t' << num << endl; } - + out.close(); m->mothurOut("\nTotal seqs: " + toString(total) + "."); m->mothurOutEndLine(); } if (m->control_pressed) { return 0; } if (countfile != "") { + map variables; + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(countfile)); + string outputFileName = getOutputFileName("summary", variables); + outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName); + ofstream out; + m->openOutputFile(outputFileName, out); + CountTable ct; ct.readTable(countfile); @@ -236,7 +274,9 @@ int CountGroupsCommand::execute(){ int num = ct.getGroupCount(Groups[i]); total += num; m->mothurOut(Groups[i] + " contains " + toString(num) + "."); m->mothurOutEndLine(); + out << Groups[i] << '\t' << num << endl; } + out.close(); m->mothurOut("\nTotal seqs: " + toString(total) + "."); m->mothurOutEndLine(); } @@ -247,17 +287,33 @@ int CountGroupsCommand::execute(){ InputData input(sharedfile, "sharedfile"); vector lookup = input.getSharedRAbundVectors(); + map variables; + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(countfile)); + string outputFileName = getOutputFileName("summary", variables); + outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName); + ofstream out; + m->openOutputFile(outputFileName, out); + int total = 0; for (int i = 0; i < lookup.size(); i++) { int num = lookup[i]->getNumSeqs(); total += num; m->mothurOut(lookup[i]->getGroup() + " contains " + toString(num) + "."); m->mothurOutEndLine(); delete lookup[i]; + out << lookup[i]->getGroup() << '\t' << num << endl; } + out.close(); m->mothurOut("\nTotal seqs: " + toString(total) + "."); m->mothurOutEndLine(); } - + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + return 0; } diff --git a/countgroupscommand.h b/countgroupscommand.h index 1ab83c8..cd226a3 100644 --- a/countgroupscommand.h +++ b/countgroupscommand.h @@ -24,7 +24,7 @@ public: string getCommandName() { return "count.groups"; } string getCommandCategory() { return "Sequence Processing"; } string getHelpString(); - string getOutputPattern(string){ return ""; } + string getOutputPattern(string); string getCitation() { return "http://www.mothur.org/wiki/Count.groups"; } string getDescription() { return "counts the number of sequences in each group"; } @@ -36,6 +36,7 @@ private: string sharedfile, groupfile, countfile, outputDir, groups, accnosfile; bool abort; vector Groups; + vector outputNames; }; #endif diff --git a/makecontigscommand.cpp b/makecontigscommand.cpp index 74133bb..3474c57 100644 --- a/makecontigscommand.cpp +++ b/makecontigscommand.cpp @@ -15,8 +15,8 @@ vector MakeContigsCommand::setParameters(){ CommandParameter prfastq("rfastq", "InputTypes", "", "", "none", "none", "fastqGroup","fasta-qfile",false,false,true); parameters.push_back(prfastq); CommandParameter pfasta("ffasta", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "fastaGroup","fasta",false,false,true); parameters.push_back(pfasta); CommandParameter prfasta("rfasta", "InputTypes", "", "", "none", "none", "none","fastaGroup",false,false,true); parameters.push_back(prfasta); - CommandParameter pfqual("fqfile", "InputTypes", "", "", "none", "none", "qfileGroup","qfile",false,false,true); parameters.push_back(pfqual); - CommandParameter prqual("rqfile", "InputTypes", "", "", "none", "none", "qfileGroup","qfile",false,false,true); parameters.push_back(prqual); + CommandParameter pfqual("fqfile", "InputTypes", "", "", "none", "none", "qfileGroup","",false,false,true); parameters.push_back(pfqual); + CommandParameter prqual("rqfile", "InputTypes", "", "", "none", "none", "qfileGroup","",false,false,true); parameters.push_back(prqual); CommandParameter pfile("file", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "none","fasta-qfile",false,false,true); parameters.push_back(pfile); CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos); CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs); @@ -31,9 +31,10 @@ vector MakeContigsCommand::setParameters(){ CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pmismatch); CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapopen); CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pgapextend); - CommandParameter pthreshold("threshold", "Number", "", "40", "", "", "","",false,false); parameters.push_back(pthreshold); + CommandParameter pthreshold("insert", "Number", "", "25", "", "", "","",false,false); parameters.push_back(pthreshold); + CommandParameter pdeltaq("deltaq", "Number", "", "6", "", "", "","",false,false); parameters.push_back(pdeltaq); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); - CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa", "sanger", "", "", "","",false,false,true); parameters.push_back(pformat); + CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa-illumina1.8+", "illumina1.8+", "", "", "","",false,false,true); parameters.push_back(pformat); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); @@ -52,24 +53,25 @@ string MakeContigsCommand::getHelpString(){ string helpString = ""; helpString += "The make.contigs command reads a file, forward fastq file and a reverse fastq file or forward fasta and reverse fasta files and outputs new fasta. It will also provide new quality files if the fastq or file parameter is used.\n"; helpString += "If an oligos file is provided barcodes and primers will be trimmed, and a group file will be created.\n"; - helpString += "The make.contigs command parameters are ffastq, rfastq, oligos, format, tdiffs, bdiffs, ldiffs, sdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, allfiles and processors.\n"; + helpString += "The make.contigs command parameters are file, ffastq, rfastq, ffasta, rfasta, fqfile, rqfile, oligos, format, tdiffs, bdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, insert, deltaq, allfiles and processors.\n"; helpString += "The ffastq and rfastq, file, or ffasta and rfasta parameters are required.\n"; - helpString += "The file parameter is 2 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column. Mothur will process each pair and create a combined fasta and qual file with all the sequences.\n"; + helpString += "The file parameter is 2 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column. Mothur will process each pair and create a combined fasta and report file with all the sequences.\n"; helpString += "The ffastq and rfastq parameters are used to provide a forward fastq and reverse fastq file to process. If you provide one, you must provide the other.\n"; helpString += "The ffasta and rfasta parameters are used to provide a forward fasta and reverse fasta file to process. If you provide one, you must provide the other.\n"; helpString += "The fqfile and rqfile parameters are used to provide a forward quality and reverse quality files to process with the ffasta and rfasta parameters. If you provide one, you must provide the other.\n"; - helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa or illumina, default=sanger.\n"; + helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=illumina1.8+.\n"; helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh and needleman. The default is needleman.\n"; helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; - helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n"; - helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; + //helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n"; + //helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n"; helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"; + helpString += "The deltaq parameter allows you to specify the delta allowed between quality scores of a mismatched base. For example in the overlap, if deltaq=5 and in the alignment seqA, pos 200 has a quality score of 30 and the same position in seqB has a quality score of 20, you take the base from seqA (30-20 >= 5). If the quality score in seqB is 28 then the base in the consensus will be an N (30-28<5) The default is 6.\n"; helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n"; helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n"; - helpString += "The threshold parameter allows you to set a quality scores threshold. In the case where we are trying to decide whether to keep a base or remove it because the base is compared to a gap in the other fragment, if the base has a quality score below the threshold we eliminate it. Default=40.\n"; + helpString += "The insert parameter allows you to set a quality scores threshold. In the case where we are trying to decide whether to keep a base or remove it because the base is compared to a gap in the other fragment, if the base has a quality score below the threshold we eliminate it. Default=25.\n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n"; helpString += "The make.contigs command should be in the following format: \n"; @@ -88,9 +90,8 @@ string MakeContigsCommand::getOutputPattern(string type) { string pattern = ""; if (type == "fasta") { pattern = "[filename],[tag],contigs.fasta"; } - else if (type == "qfile") { pattern = "[filename],[tag],contigs.qual"; } else if (type == "group") { pattern = "[filename],[tag],contigs.groups"; } - else if (type == "mismatch") { pattern = "[filename],[tag],contigs.mismatch"; } + else if (type == "report") { pattern = "[filename],[tag],contigs.report"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } return pattern; @@ -107,9 +108,8 @@ MakeContigsCommand::MakeContigsCommand(){ setParameters(); vector tempOutNames; outputTypes["fasta"] = tempOutNames; - outputTypes["qfile"] = tempOutNames; outputTypes["group"] = tempOutNames; - outputTypes["mismatch"] = tempOutNames; + outputTypes["report"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "MakeContigsCommand", "MakeContigsCommand"); @@ -142,8 +142,7 @@ MakeContigsCommand::MakeContigsCommand(string option) { //initialize outputTypes vector tempOutNames; outputTypes["fasta"] = tempOutNames; - outputTypes["qfile"] = tempOutNames; - outputTypes["mismatch"] = tempOutNames; + outputTypes["report"] = tempOutNames; outputTypes["group"] = tempOutNames; @@ -287,10 +286,13 @@ MakeContigsCommand::MakeContigsCommand(string option) { m->mothurConvert(temp, gapExtend); if (gapExtend > 0) { m->mothurOut("[ERROR]: gapextend must be negative.\n"); abort=true; } - temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found"){ temp = "40"; } - m->mothurConvert(temp, threshold); - if ((threshold < 0) || (threshold > 40)) { m->mothurOut("[ERROR]: threshold must be between 0 and 40.\n"); abort=true; } + temp = validParameter.validFile(parameters, "insert", false); if (temp == "not found"){ temp = "25"; } + m->mothurConvert(temp, insert); + if ((insert < 0) || (insert > 40)) { m->mothurOut("[ERROR]: insert must be between 0 and 40.\n"); abort=true; } + temp = validParameter.validFile(parameters, "deltaq", false); if (temp == "not found"){ temp = "6"; } + m->mothurConvert(temp, deltaq); + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); @@ -320,10 +322,10 @@ MakeContigsCommand::MakeContigsCommand(string option) { align = validParameter.validFile(parameters, "align", false); if (align == "not found"){ align = "needleman"; } if ((align != "needleman") && (align != "gotoh")) { m->mothurOut(align + " is not a valid alignment method. Options are needleman or gotoh. I will use needleman."); m->mothurOutEndLine(); align = "needleman"; } - format = validParameter.validFile(parameters, "format", false); if (format == "not found"){ format = "sanger"; } + format = validParameter.validFile(parameters, "format", false); if (format == "not found"){ format = "illumina1.8+"; } - if ((format != "sanger") && (format != "illumina") && (format != "solexa")) { - m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa and illumina, aborting." ); m->mothurOutEndLine(); + if ((format != "sanger") && (format != "illumina") && (format != "illumina1.8+") && (format != "solexa")) { + m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa, illumina1.8+ and illumina, aborting." ); m->mothurOutEndLine(); abort=true; } @@ -365,25 +367,19 @@ int MakeContigsCommand::execute(){ string compositeGroupFile = getOutputFileName("group",cvars); cvars["[tag]"] = "trim"; string compositeFastaFile = getOutputFileName("fasta",cvars); - string compositeQualFile = getOutputFileName("qfile",cvars); cvars["[tag]"] = "scrap"; string compositeScrapFastaFile = getOutputFileName("fasta",cvars); - string compositeScrapQualFile = getOutputFileName("qfile",cvars); cvars["[tag]"] = ""; - string compositeMisMatchFile = getOutputFileName("mismatch",cvars); + string compositeMisMatchFile = getOutputFileName("report",cvars); if (filesToProcess.size() > 1) { //clear files for append below ofstream outCTFasta, outCTQual, outCSFasta, outCSQual, outCMisMatch; m->openOutputFile(compositeFastaFile, outCTFasta); outCTFasta.close(); m->openOutputFile(compositeScrapFastaFile, outCSFasta); outCSFasta.close(); m->openOutputFile(compositeMisMatchFile, outCMisMatch); outCMisMatch.close(); - m->openOutputFile(compositeQualFile, outCTQual); outCTQual.close(); - m->openOutputFile(compositeScrapQualFile, outCSQual); outCSQual.close(); outputNames.push_back(compositeFastaFile); outputTypes["fasta"].push_back(compositeFastaFile); - outputNames.push_back(compositeQualFile); outputTypes["qfile"].push_back(compositeQualFile); - outputNames.push_back(compositeMisMatchFile); outputTypes["mismatch"].push_back(compositeMisMatchFile); + outputNames.push_back(compositeMisMatchFile); outputTypes["report"].push_back(compositeMisMatchFile); outputNames.push_back(compositeScrapFastaFile); outputTypes["fasta"].push_back(compositeScrapFastaFile); - outputNames.push_back(compositeScrapQualFile); outputTypes["qfile"].push_back(compositeScrapQualFile); } for (int l = 0; l < filesToProcess.size(); l++) { @@ -391,7 +387,6 @@ int MakeContigsCommand::execute(){ m->mothurOut("\n>>>>>\tProcessing " + filesToProcess[l][0][0] + " (file " + toString(l+1) + " of " + toString(filesToProcess.size()) + ")\t<<<<<\n"); vector > fastaFileNames; - vector > qualFileNames; createGroup = false; string outputGroupFileName; map variables; @@ -400,7 +395,7 @@ int MakeContigsCommand::execute(){ variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(filesToProcess[l][0][0])); variables["[tag]"] = ""; if(oligosfile != ""){ - createGroup = getOligos(fastaFileNames, qualFileNames, variables["[filename]"]); + createGroup = getOligos(fastaFileNames, variables["[filename]"]); if (createGroup) { outputGroupFileName = getOutputFileName("group",variables); outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); @@ -409,22 +404,16 @@ int MakeContigsCommand::execute(){ variables["[tag]"] = "trim"; string outFastaFile = getOutputFileName("fasta",variables); - string outQualFile = getOutputFileName("qfile",variables); variables["[tag]"] = "scrap"; string outScrapFastaFile = getOutputFileName("fasta",variables); - string outScrapQualFile = getOutputFileName("qfile",variables); variables["[tag]"] = ""; - string outMisMatchFile = getOutputFileName("mismatch",variables); + string outMisMatchFile = getOutputFileName("report",variables); outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); outputNames.push_back(outScrapFastaFile); outputTypes["fasta"].push_back(outScrapFastaFile); - if (filesToProcess[l][0][1] != "") { - outputNames.push_back(outQualFile); outputTypes["qfile"].push_back(outQualFile); - outputNames.push_back(outScrapQualFile); outputTypes["qfile"].push_back(outScrapQualFile); - } - outputNames.push_back(outMisMatchFile); outputTypes["mismatch"].push_back(outMisMatchFile); + outputNames.push_back(outMisMatchFile); outputTypes["report"].push_back(outMisMatchFile); m->mothurOut("Making contigs...\n"); - createProcesses(filesToProcess[l], outFastaFile, outQualFile, outScrapFastaFile, outScrapQualFile, outMisMatchFile, fastaFileNames, qualFileNames); + createProcesses(filesToProcess[l], outFastaFile, outScrapFastaFile, outMisMatchFile, fastaFileNames); m->mothurOut("Done.\n"); //remove temp fasta and qual files @@ -443,11 +432,6 @@ int MakeContigsCommand::execute(){ if(m->isBlank(fastaFileNames[i][j])){ m->mothurRemove(fastaFileNames[i][j]); namesToRemove.insert(fastaFileNames[i][j]); - - if (filesToProcess[l][0][1] != "") { - m->mothurRemove(qualFileNames[i][j]); - namesToRemove.insert(qualFileNames[i][j]); - } }else{ it = uniqueFastaNames.find(fastaFileNames[i][j]); if (it == uniqueFastaNames.end()) { @@ -504,9 +488,7 @@ int MakeContigsCommand::execute(){ } m->appendFiles(outMisMatchFile, compositeMisMatchFile); m->appendFiles(outFastaFile, compositeFastaFile); - m->appendFiles(outQualFile, compositeQualFile); m->appendFiles(outScrapFastaFile, compositeScrapFastaFile); - m->appendFiles(outScrapQualFile, compositeScrapQualFile); } } m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process " + toString(numReads) + " sequences.\n"); @@ -530,12 +512,6 @@ int MakeContigsCommand::execute(){ if ((itTypes->second).size() != 0) { currentFasta = (itTypes->second)[0]; m->setFastaFile(currentFasta); } } - string currentQual = ""; - itTypes = outputTypes.find("qfile"); - if (itTypes != outputTypes.end()) { - if ((itTypes->second).size() != 0) { currentQual = (itTypes->second)[0]; m->setQualFile(currentQual); } - } - string currentGroup = ""; itTypes = outputTypes.find("group"); if (itTypes != outputTypes.end()) { @@ -616,7 +592,7 @@ vector< vector< vector > > MakeContigsCommand::preProcessData(unsigned l } } //********************************************************************************************************************** -int MakeContigsCommand::createProcesses(vector< vector > files, string outputFasta, string outputQual, string outputScrapFasta, string outputScrapQual, string outputMisMatches, vector > fastaFileNames, vector > qualFileNames) { +int MakeContigsCommand::createProcesses(vector< vector > files, string outputFasta, string outputScrapFasta, string outputMisMatches, vector > fastaFileNames) { try { int num = 0; vector processIDS; @@ -632,7 +608,6 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o process++; }else if (pid == 0){ vector > tempFASTAFileNames = fastaFileNames; - vector > tempPrimerQualFileNames = qualFileNames; if(allFiles){ ofstream temp; @@ -642,11 +617,6 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o if (tempFASTAFileNames[i][j] != "") { tempFASTAFileNames[i][j] += toString(getpid()) + ".temp"; m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); - - if (files[processors-1][1] != "") { - tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp"; - m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); - } } } } @@ -654,12 +624,9 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o num = driver(files[process], outputFasta + toString(getpid()) + ".temp", - outputQual + toString(getpid()) + ".temp", outputScrapFasta + toString(getpid()) + ".temp", - outputScrapQual + toString(getpid()) + ".temp", outputMisMatches + toString(getpid()) + ".temp", - tempFASTAFileNames, - tempPrimerQualFileNames); + tempFASTAFileNames, process); //pass groupCounts to parent ofstream out; @@ -691,13 +658,9 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o ofstream temp; m->openOutputFile(outputFasta, temp); temp.close(); m->openOutputFile(outputScrapFasta, temp); temp.close(); - if (files[processors-1][1] != "") { - m->openOutputFile(outputScrapQual, temp); temp.close(); - m->openOutputFile(outputQual, temp); temp.close(); - } - + //do my part - num = driver(files[processors-1], outputFasta, outputQual, outputScrapFasta, outputScrapQual, outputMisMatches, fastaFileNames, qualFileNames); + num = driver(files[processors-1], outputFasta, outputScrapFasta, outputMisMatches, fastaFileNames, processors-1); //force parent to wait until all the processes are done for (int i=0;i > files, string o string extension = ""; if (h != 0) { extension = toString(h) + ".temp"; processIDS.push_back(h); } vector > tempFASTAFileNames = fastaFileNames; - vector > tempPrimerQualFileNames = qualFileNames; - + if(allFiles){ ofstream temp; @@ -766,25 +728,19 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o if (tempFASTAFileNames[i][j] != "") { tempFASTAFileNames[i][j] += extension; m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); - - if (files[processors-1][1] != "") { - tempPrimerQualFileNames[i][j] += extension; - m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); - } } } } } - contigsData* tempcontig = new contigsData(files[h], (outputFasta + extension), (outputQual + extension), (outputScrapFasta + extension), (outputScrapQual + extension),(outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, threshold, barcodes, primers, tempFASTAFileNames, tempPrimerQualFileNames, barcodeNameVector, primerNameVector, pdiffs, bdiffs, tdiffs, createGroup, allFiles, h); + contigsData* tempcontig = new contigsData(files[h], (outputFasta + extension), (outputScrapFasta + extension), (outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, insert, deltaq, barcodes, primers, tempFASTAFileNames, barcodeNameVector, primerNameVector, pdiffs, bdiffs, tdiffs, createGroup, allFiles, h); pDataArray.push_back(tempcontig); hThreadArray[h] = CreateThread(NULL, 0, MyContigsThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]); } vector > tempFASTAFileNames = fastaFileNames; - vector > tempPrimerQualFileNames = qualFileNames; if(allFiles){ ofstream temp; @@ -795,11 +751,6 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o if (tempFASTAFileNames[i][j] != "") { tempFASTAFileNames[i][j] += extension; m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); - - if (files[processors-1][1] != "") { - tempPrimerQualFileNames[i][j] += extension; - m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); - } } } } @@ -809,14 +760,10 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o ofstream temp; m->openOutputFile(outputFasta, temp); temp.close(); m->openOutputFile(outputScrapFasta, temp); temp.close(); - if (files[processors-1][1] != "") { - m->openOutputFile(outputScrapQual, temp); temp.close(); - m->openOutputFile(outputQual, temp); temp.close(); - } //do my part processIDS.push_back(processors-1); - num = driver(files[processors-1], (outputFasta+ toString(processors-1) + ".temp"), (outputQual+ toString(processors-1) + ".temp"), (outputScrapFasta+ toString(processors-1) + ".temp"), (outputScrapQual+ toString(processors-1) + ".temp"), (outputMisMatches+ toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames); + num = driver(files[processors-1], (outputFasta+ toString(processors-1) + ".temp"), (outputScrapFasta+ toString(processors-1) + ".temp"), (outputMisMatches+ toString(processors-1) + ".temp"), tempFASTAFileNames, processors-1); //Wait until all threads have terminated. WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); @@ -849,14 +796,6 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o m->appendFiles((outputScrapFasta + toString(processIDS[i]) + ".temp"), outputScrapFasta); m->mothurRemove((outputScrapFasta + toString(processIDS[i]) + ".temp")); - - if (files[processors-1][1] != "") { - m->appendFiles((outputScrapQual + toString(processIDS[i]) + ".temp"), outputScrapQual); - m->mothurRemove((outputScrapQual + toString(processIDS[i]) + ".temp")); - - m->appendFiles((outputQual + toString(processIDS[i]) + ".temp"), outputQual); - m->mothurRemove((outputQual + toString(processIDS[i]) + ".temp")); - } m->appendFiles((outputMisMatches + toString(processIDS[i]) + ".temp"), outputMisMatches); m->mothurRemove((outputMisMatches + toString(processIDS[i]) + ".temp")); @@ -867,11 +806,6 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o if (fastaFileNames[j][k] != "") { m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]); m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp")); - - if (files[processors-1][1] != "") { - m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]); - m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp")); - } } } } @@ -886,7 +820,7 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o } } //********************************************************************************************************************** -int MakeContigsCommand::driver(vector files, string outputFasta, string outputQual, string outputScrapFasta, string outputScrapQual, string outputMisMatches, vector > fastaFileNames, vector > qualFileNames){ +int MakeContigsCommand::driver(vector files, string outputFasta, string outputScrapFasta, string outputMisMatches, vector > fastaFileNames, int process){ try { Alignment* alignment; @@ -902,19 +836,17 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string if (m->debug) { m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); } ifstream inFFasta, inRFasta, inFQual, inRQual; - ofstream outFasta, outQual, outMisMatch, outScrapFasta, outScrapQual; + ofstream outFasta, outMisMatch, outScrapFasta; m->openInputFile(thisffastafile, inFFasta); m->openInputFile(thisrfastafile, inRFasta); if (thisfqualfile != "") { m->openInputFile(thisfqualfile, inFQual); m->openInputFile(thisrqualfile, inRQual); - m->openOutputFile(outputScrapQual, outScrapQual); - m->openOutputFile(outputQual, outQual); } m->openOutputFile(outputFasta, outFasta); m->openOutputFile(outputScrapFasta, outScrapFasta); m->openOutputFile(outputMisMatches, outMisMatch); - outMisMatch << "Name\tLength\tMisMatches\n"; + if (process == 0) { outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n"; } TrimOligos trimOligos(pdiffs, bdiffs, 0, 0, primers, barcodes); @@ -974,7 +906,6 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string //traverse alignments merging into one contiguous seq string contig = ""; - vector contigScores; int numMismatches = 0; string seq1 = fSeq.getAligned(); string seq2 = rSeq.getAligned(); @@ -991,15 +922,9 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string //bigger of the 2 starting positions is the location of the overlapping start if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1 overlapStart = seq2Start; - for (int i = 0; i < overlapStart; i++) { - contig += seq1[i]; - if (thisfqualfile != "") { contigScores.push_back(scores1[ABaseMap[i]]); } - } + for (int i = 0; i < overlapStart; i++) { contig += seq1[i]; } }else { //seq1 starts later so take from 0 to overlapStart from seq2 - for (int i = 0; i < overlapStart; i++) { - contig += seq2[i]; - if (thisfqualfile != "") { contigScores.push_back(scores2[BBaseMap[i]]); } - } + for (int i = 0; i < overlapStart; i++) { contig += seq2[i]; } } int seq1End = fSeq.getEndPos(); @@ -1007,53 +932,40 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string int overlapEnd = seq1End; if (seq2End < overlapEnd) { overlapEnd = seq2End; } //smallest end position is where overlapping ends + int oStart = contig.length(); for (int i = overlapStart; i < overlapEnd; i++) { if (seq1[i] == seq2[i]) { //match, add base and choose highest score contig += seq1[i]; - if (thisfqualfile != "") { - contigScores.push_back(scores1[ABaseMap[i]]); - if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; } - } - }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base + }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below insert. In that case eliminate base if (thisfqualfile != "") { - if (scores2[BBaseMap[i]] < threshold) { } // - else { - contig += seq2[i]; - contigScores.push_back(scores2[BBaseMap[i]]); - } + if (scores2[BBaseMap[i]] < insert) { } // + else { contig += seq2[i]; } }else { contig += seq2[i]; } //with no quality info, then we keep it? - }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base + }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below insert. In that case eliminate base if (thisfqualfile != "") { - if (scores1[ABaseMap[i]] < threshold) { } // - else { - contig += seq1[i]; - contigScores.push_back(scores1[ABaseMap[i]]); - } + if (scores1[ABaseMap[i]] < insert) { } // + else { contig += seq1[i]; } }else { contig += seq1[i]; } //with no quality info, then we keep it? }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality if (thisfqualfile != "") { - char c = seq1[i]; - contigScores.push_back(scores1[ABaseMap[i]]); - if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; c = seq2[i]; } - contig += c; + if (abs(scores1[ABaseMap[i]] - scores2[BBaseMap[i]]) >= deltaq) { //is the difference in qual scores >= deltaq, if yes choose base with higher score + char c = seq1[i]; + if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { c = seq2[i]; } + contig += c; + }else { //if no, base becomes n + contig += 'N'; + } numMismatches++; }else { numMismatches++; } //cant decide, so eliminate and mark as mismatch }else { //should never get here m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n"); } } - + int oend = contig.length(); if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2 - for (int i = overlapEnd; i < length; i++) { - contig += seq2[i]; - if (thisfqualfile != "") { contigScores.push_back(scores2[BBaseMap[i]]); } - } + for (int i = overlapEnd; i < length; i++) { contig += seq2[i]; } }else { //seq2 ends before seq1 so take from overlap to length from seq1 - for (int i = overlapEnd; i < length; i++) { - contig += seq1[i]; - if (thisfqualfile != "") { contigScores.push_back(scores1[ABaseMap[i]]); } - } - + for (int i = overlapEnd; i < length; i++) { contig += seq1[i]; } } if(trashCode.length() == 0){ @@ -1094,32 +1006,16 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output); output << ">" << fSeq.getName() << endl << contig << endl; output.close(); - - if (thisfqualfile != "") { - m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output); - output << ">" << fSeq.getName() << endl; - for (int i = 0; i < contigScores.size(); i++) { output << contigScores[i] << ' '; } - output << endl; - output.close(); - } } //output outFasta << ">" << fSeq.getName() << endl << contig << endl; - if (thisfqualfile != "") { - outQual << ">" << fSeq.getName() << endl; - for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; } - outQual << endl; - } - outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl; + int numNs = 0; + for (int i = 0; i < contig.length(); i++) { if (contig[i] == 'N') { numNs++; } } + outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << (oend-oStart) << '\t' << oStart << '\t' << oend << '\t' << numMismatches << '\t' << numNs << endl; }else { //output outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl; - if (thisfqualfile != "") { - outScrapQual << ">" << fSeq.getName() << " | " << trashCode << endl; - for (int i = 0; i < contigScores.size(); i++) { outScrapQual << contigScores[i] << ' '; } - outScrapQual << endl; - } } num++; @@ -1138,12 +1034,10 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string if (thisfqualfile != "") { inFQual.close(); inRQual.close(); - outQual.close(); - outScrapQual.close(); } delete alignment; - if (m->control_pressed) { m->mothurRemove(outputFasta); m->mothurRemove(outputScrapFasta);m->mothurRemove(outputMisMatches); if (thisfqualfile != "") { m->mothurRemove(outputQual); m->mothurRemove(outputScrapQual); } } + if (m->control_pressed) { m->mothurRemove(outputFasta); m->mothurRemove(outputScrapFasta);m->mothurRemove(outputMisMatches); } return num; } @@ -1224,7 +1118,7 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); } - if (checkReads(fread, rread, ffastq, rfastq)) { + //if (checkReads(fread, rread, ffastq, rfastq)) { if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; } //if the reads are okay write to output files @@ -1243,7 +1137,7 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c //report progress if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } - } + //} } } //report progress @@ -1367,7 +1261,7 @@ vector< vector > MakeContigsCommand::readFastaFiles(unsigned long int& c if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); } - if (checkReads(fread, rread, ffasta, rfasta)) { + // if (checkReads(fread, rread, ffasta, rfasta)) { if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inReverseFasta.close(); inForwardFasta.close(); if (fqualfile != "") { inReverseQual.close(); inReverseQual.close(); } return files; } //if the reads are okay write to output files @@ -1387,7 +1281,7 @@ vector< vector > MakeContigsCommand::readFastaFiles(unsigned long int& c //report progress if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } - } + //} } } //report progress @@ -1539,7 +1433,7 @@ fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){ exit(1); } } -//********************************************************************************************************************** +/********************************************************************************************************************** bool MakeContigsCommand::checkReads(fastqRead& forward, fastqRead& reverse, string ffile, string rfile){ try { bool good = true; @@ -1562,7 +1456,7 @@ bool MakeContigsCommand::checkReads(fastqRead& forward, fastqRead& reverse, stri m->errorOut(e, "MakeContigsCommand", "checkReads"); exit(1); } -} +}*/ //*************************************************************************************************************** vector< vector > MakeContigsCommand::readFileNames(string filename){ try { @@ -1664,7 +1558,7 @@ vector< vector > MakeContigsCommand::readFileNames(string filename){ //BARCODE atgcatgc atgcatgc groupName //PRIMER atgcatgc atgcatgc groupName //PRIMER atgcatgc atgcatgc -bool MakeContigsCommand::getOligos(vector >& fastaFileNames, vector >& qualFileNames, string rootname){ +bool MakeContigsCommand::getOligos(vector >& fastaFileNames, string rootname){ try { ifstream in; m->openInputFile(oligosfile, in); @@ -1681,7 +1575,7 @@ bool MakeContigsCommand::getOligos(vector >& fastaFileNames, vect while(!in.eof()){ in >> type; - cout << type << endl; + if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); } if(type[0] == '#'){ @@ -1764,7 +1658,6 @@ bool MakeContigsCommand::getOligos(vector >& fastaFileNames, vect barcodes[indexBarcode]=newPair; indexBarcode++; barcodeNameVector.push_back(group); - cout << group << endl; }else if(type == "LINKER"){ linker.push_back(foligo); m->mothurOut("[WARNING]: make.contigs is not setup to remove linkers, ignoring.\n"); @@ -1797,7 +1690,6 @@ bool MakeContigsCommand::getOligos(vector >& fastaFileNames, vect for(int i=0;i uniqueNames; //used to cleanup outputFileNames @@ -1838,17 +1730,6 @@ bool MakeContigsCommand::getOligos(vector >& fastaFileNames, vect fastaFileNames[itBar->first][itPrimer->first] = fastaFileName; m->openOutputFile(fastaFileName, temp); temp.close(); - - if ((fqualfile != "") || (ffastqfile != "") || (file != "")) { - qualFileName = rootname + ".qual"; - if (uniqueNames.count(qualFileName) == 0) { - outputNames.push_back(qualFileName); - outputTypes["qfile"].push_back(qualFileName); - } - - qualFileNames[itBar->first][itPrimer->first] = qualFileName; - m->openOutputFile(qualFileName, temp); temp.close(); - } } } } @@ -1926,6 +1807,7 @@ string MakeContigsCommand::reverseOligo(string oligo){ vector MakeContigsCommand::convertQual(string qual) { try { vector qualScores; + bool negativeScores = false; for (int i = 0; i < qual.length(); i++) { @@ -1933,15 +1815,21 @@ vector MakeContigsCommand::convertQual(string qual) { temp = int(qual[i]); if (format == "illumina") { temp -= 64; //char '@' + }else if (format == "illumina1.8+") { + temp -= int('!'); //char '!' }else if (format == "solexa") { temp = int(convertTable[temp]); //convert to sanger temp -= int('!'); //char '!' }else { temp -= int('!'); //char '!' } + + if (temp < -5) { negativeScores = true; } qualScores.push_back(temp); } + if (negativeScores) { m->mothurOut("[ERROR]: finding negative quality scores, do you have the right format selected? http://en.wikipedia.org/wiki/FASTQ_format#Encoding \n"); m->control_pressed = true; } + return qualScores; } catch(exception& e) { diff --git a/makecontigscommand.h b/makecontigscommand.h index b61ebda..a23d397 100644 --- a/makecontigscommand.h +++ b/makecontigscommand.h @@ -62,7 +62,7 @@ private: bool abort, allFiles, createGroup; string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, file, format; float match, misMatch, gapOpen, gapExtend; - int processors, longestBase, threshold, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs; + int processors, longestBase, insert, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, deltaq; vector outputNames; map barcodes; @@ -82,10 +82,10 @@ private: vector< vector > readFileNames(string); vector< vector > readFastqFiles(unsigned long int&, string, string); vector< vector > readFastaFiles(unsigned long int&, string, string); - bool checkReads(fastqRead&, fastqRead&, string, string); - int createProcesses(vector< vector >, string, string, string, string, string, vector >, vector >); - int driver(vector, string, string, string, string, string, vector >, vector >); - bool getOligos(vector >&, vector< vector >&, string); + //bool checkReads(fastqRead&, fastqRead&, string, string); + int createProcesses(vector< vector >, string, string, string, vector >); + int driver(vector, string, string, string, vector >, int); + bool getOligos(vector >&, string); string reverseOligo(string); vector getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map& uniques); }; @@ -98,17 +98,14 @@ private: // that can be passed using a single void pointer (LPVOID). struct contigsData { string outputFasta; - string outputQual; string outputScrapFasta; - string outputScrapQual; string outputMisMatches; string align; vector files; vector > fastaFileNames; - vector > qualFileNames; MothurOut* m; float match, misMatch, gapOpen, gapExtend; - int count, threshold, threadID, pdiffs, bdiffs, tdiffs; + int count, insert, threadID, pdiffs, bdiffs, tdiffs, deltaq; bool allFiles, createGroup, done; map groupCounts; map groupMap; @@ -118,23 +115,20 @@ struct contigsData { map primers; contigsData(){} - contigsData(vector f, string of, string oq, string osf, string osq, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, map br, map pr, vector > ffn, vector > qfn, vectorbnv, vector pnv, int pdf, int bdf, int tdf, bool cg, bool all, int tid) { + contigsData(vector f, string of, string osf, string om, string al, MothurOut* mout, float ma, float misMa, float gapO, float gapE, int thr, int delt, map br, map pr, vector > ffn, vectorbnv, vector pnv, int pdf, int bdf, int tdf, bool cg, bool all, int tid) { files = f; outputFasta = of; - outputQual = oq; outputMisMatches = om; m = mout; match = ma; misMatch = misMa; gapOpen = gapO; gapExtend = gapE; - threshold = thr; + insert = thr; align = al; count = 0; outputScrapFasta = osf; - outputScrapQual = osq; fastaFileNames = ffn; - qualFileNames = qfn; barcodes = br; primers = pr; barcodeNameVector = bnv; @@ -145,6 +139,7 @@ struct contigsData { allFiles = all; createGroup = cg; threadID = tid; + deltaq = delt; done=false; } }; @@ -176,27 +171,24 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ if (pDataArray->fastaFileNames[i][j] != "") { ofstream temp; pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close(); - if (thisfqualfile != "") { pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close(); } } } } } ifstream inFFasta, inRFasta, inFQual, inRQual; - ofstream outFasta, outQual, outMisMatch, outScrapFasta, outScrapQual; + ofstream outFasta, outMisMatch, outScrapFasta; pDataArray->m->openInputFile(thisffastafile, inFFasta); pDataArray->m->openInputFile(thisrfastafile, inRFasta); if (thisfqualfile != "") { pDataArray->m->openInputFile(thisfqualfile, inFQual); pDataArray->m->openInputFile(thisrqualfile, inRQual); - pDataArray->m->openOutputFile(pDataArray->outputQual, outQual); - pDataArray->m->openOutputFile(pDataArray->outputScrapQual, outScrapQual); } pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta); pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch); pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta); - outMisMatch << "Name\tLength\tMisMatches\n"; + if (pDataArray->threadID == 0) { outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n"; } TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->primers, pDataArray->barcodes); @@ -256,7 +248,6 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ //traverse alignments merging into one contiguous seq string contig = ""; - vector contigScores; int numMismatches = 0; string seq1 = fSeq.getAligned(); string seq2 = rSeq.getAligned(); @@ -272,15 +263,9 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ //bigger of the 2 starting positions is the location of the overlapping start if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1 overlapStart = seq2Start; - for (int i = 0; i < overlapStart; i++) { - contig += seq1[i]; - if (thisfqualfile != "") { contigScores.push_back(scores1[ABaseMap[i]]); } - } + for (int i = 0; i < overlapStart; i++) { contig += seq1[i]; } }else { //seq1 starts later so take from 0 to overlapStart from seq2 - for (int i = 0; i < overlapStart; i++) { - contig += seq2[i]; - if (thisfqualfile != "") { contigScores.push_back(scores2[BBaseMap[i]]); } - } + for (int i = 0; i < overlapStart; i++) { contig += seq2[i]; } } int seq1End = fSeq.getEndPos(); @@ -288,53 +273,41 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ int overlapEnd = seq1End; if (seq2End < overlapEnd) { overlapEnd = seq2End; } //smallest end position is where overlapping ends + int oStart = contig.length(); for (int i = overlapStart; i < overlapEnd; i++) { if (seq1[i] == seq2[i]) { //match, add base and choose highest score contig += seq1[i]; - if (thisfqualfile != "") { - contigScores.push_back(scores1[ABaseMap[i]]); - if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; } - } - }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base + }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below insert. In that case eliminate base if (thisfqualfile != "") { - if (scores2[BBaseMap[i]] < pDataArray->threshold) { } // - else { - contig += seq2[i]; - contigScores.push_back(scores2[BBaseMap[i]]); - } - }else { contig += seq2[i]; } - }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base + if (scores2[BBaseMap[i]] < pDataArray->insert) { } // + else { contig += seq2[i]; } + }else { contig += seq2[i]; } //with no quality info, then we keep it? + }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below insert. In that case eliminate base if (thisfqualfile != "") { - if (scores1[ABaseMap[i]] < pDataArray->threshold) { } // - else { - contig += seq1[i]; - contigScores.push_back(scores1[ABaseMap[i]]); - } - }else { contig += seq1[i]; } + if (scores1[ABaseMap[i]] < pDataArray->insert) { } // + else { contig += seq1[i]; } + }else { contig += seq1[i]; } //with no quality info, then we keep it? }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality if (thisfqualfile != "") { - char c = seq1[i]; - contigScores.push_back(scores1[ABaseMap[i]]); - if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; c = seq2[i]; } - contig += c; + if (abs(scores1[ABaseMap[i]] - scores2[BBaseMap[i]]) >= pDataArray->deltaq) { //is the difference in qual scores >= deltaq, if yes choose base with higher score + char c = seq1[i]; + if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { c = seq2[i]; } + contig += c; + }else { //if no, base becomes n + contig += 'N'; + } numMismatches++; - }else { numMismatches++; } + }else { numMismatches++; } //cant decide, so eliminate and mark as mismatch }else { //should never get here pDataArray->m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n"); } } + int oend = contig.length(); if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2 - for (int i = overlapEnd; i < length; i++) { - contig += seq2[i]; - if (thisfqualfile != "") { contigScores.push_back(scores2[BBaseMap[i]]); } - } + for (int i = overlapEnd; i < length; i++) { contig += seq2[i]; } }else { //seq2 ends before seq1 so take from overlap to length from seq1 - for (int i = overlapEnd; i < length; i++) { - contig += seq1[i]; - if (thisfqualfile != "") { contigScores.push_back(scores1[ABaseMap[i]]); } - } - + for (int i = overlapEnd; i < length; i++) { contig += seq1[i]; } } if(trashCode.length() == 0){ @@ -370,32 +343,16 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output); output << ">" << fSeq.getName() << endl << contig << endl; output.close(); - - if (thisfqualfile != "") { - pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output); - output << ">" << fSeq.getName() << endl; - for (int i = 0; i < contigScores.size(); i++) { output << contigScores[i] << ' '; } - output << endl; - output.close(); - } } //output outFasta << ">" << fSeq.getName() << endl << contig << endl; - if (thisfqualfile != "") { - outQual << ">" << fSeq.getName() << endl; - for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; } - outQual << endl; - } - outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl; + int numNs = 0; + for (int i = 0; i < contig.length(); i++) { if (contig[i] == 'N') { numNs++; } } + outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << (oend-oStart) << '\t' << oStart << '\t' << oend << '\t' << numMismatches << '\t' << numNs << endl; }else { //output outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl; - if (thisfqualfile != "") { - outScrapQual << ">" << fSeq.getName() << " | " << trashCode << endl; - for (int i = 0; i < contigScores.size(); i++) { outScrapQual << contigScores[i] << ' '; } - outScrapQual << endl; - } } pDataArray->count++; @@ -414,13 +371,11 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ if (thisfqualfile != "") { inFQual.close(); inRQual.close(); - outQual.close(); - outScrapQual.close(); } delete alignment; pDataArray->done = true; - if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(pDataArray->outputFasta); pDataArray->m->mothurRemove(pDataArray->outputMisMatches); pDataArray->m->mothurRemove(pDataArray->outputScrapFasta); if (thisfqualfile != "") { pDataArray->m->mothurRemove(pDataArray->outputQual); pDataArray->m->mothurRemove(pDataArray->outputScrapQual); } } + if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(pDataArray->outputFasta); pDataArray->m->mothurRemove(pDataArray->outputMisMatches); pDataArray->m->mothurRemove(pDataArray->outputScrapFasta); } return 0; diff --git a/makefastqcommand.cpp b/makefastqcommand.cpp index 6712196..5a66d9a 100644 --- a/makefastqcommand.cpp +++ b/makefastqcommand.cpp @@ -16,7 +16,7 @@ vector MakeFastQCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fastq",false,true,true); parameters.push_back(pfasta); CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none","fastq",false,true,true); parameters.push_back(pqfile); - CommandParameter pformat("format", "Multiple", "sanger-illumina", "sanger", "", "", "","",false,false); parameters.push_back(pformat); + CommandParameter pformat("format", "Multiple", "sanger-illumina-illumina1.8+", "sanger", "", "", "","",false,false); parameters.push_back(pformat); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); @@ -35,7 +35,7 @@ string MakeFastQCommand::getHelpString(){ string helpString = ""; helpString += "The make.fastq command reads a fasta and quality file and creates a fastq file.\n"; helpString += "The make.fastq command parameters are fasta, qfile and format. fasta and qfile are required.\n"; - helpString += "The format parameter is used to indicate whether your sequences are sanger or illumina, default=sanger.\n"; + helpString += "The format parameter is used to indicate whether your sequences are sanger, illumina1.8+ or illumina, default=sanger.\n"; helpString += "The make.fastq command should be in the following format: make.fastq(qfile=yourQualityFile, fasta=yourFasta).\n"; helpString += "Example make.fastq(fasta=amazon.fasta, qfile=amazon.qual).\n"; helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"; @@ -147,8 +147,8 @@ MakeFastQCommand::MakeFastQCommand(string option) { format = validParameter.validFile(parameters, "format", false); if (format == "not found"){ format = "sanger"; } - if ((format != "sanger") && (format != "illumina") && (format != "solexa")) { - m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa and illumina, aborting." ); m->mothurOutEndLine(); + if ((format != "sanger") && (format != "illumina") && (format != "illumina1.8+")) { + m->mothurOut(format + " is not a valid format. Your format choices are sanger, illumina1.8+ and illumina, aborting." ); m->mothurOutEndLine(); abort=true; } diff --git a/makefile b/makefile index 4584505..d4035c8 100644 --- a/makefile +++ b/makefile @@ -15,8 +15,8 @@ USEREADLINE ?= yes CYGWIN_BUILD ?= no USECOMPRESSION ?= no MOTHUR_FILES="\"Enter_your_default_path_here\"" -RELEASE_DATE = "\"11/2/2012\"" -VERSION = "\"1.28.0\"" +RELEASE_DATE = "\"1/23/2013\"" +VERSION = "\"1.29.1\"" FORTAN_COMPILER = gfortran FORTRAN_FLAGS = diff --git a/mothurout.cpp b/mothurout.cpp index 1b72137..dc77490 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -1430,6 +1430,83 @@ vector MothurOut::divideFile(string filename, int& proc) { } } /**************************************************************************************************/ + +vector MothurOut::divideFilePerLine(string filename, int& proc) { + try{ + vector filePos; + filePos.push_back(0); + + FILE * pFile; + unsigned long long size; + + filename = getFullPathName(filename); + + //get num bytes in file + pFile = fopen (filename.c_str(),"rb"); + if (pFile==NULL) perror ("Error opening file"); + else{ + fseek (pFile, 0, SEEK_END); + size=ftell (pFile); + fclose (pFile); + } + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + + //estimate file breaks + unsigned long long chunkSize = 0; + chunkSize = size / proc; + + //file to small to divide by processors + if (chunkSize == 0) { proc = 1; filePos.push_back(size); return filePos; } + + //for each process seekg to closest file break and search for next '>' char. make that the filebreak + for (int i = 0; i < proc; i++) { + unsigned long long spot = (i+1) * chunkSize; + + ifstream in; + openInputFile(filename, in); + in.seekg(spot); + + //look for next line break + unsigned long long newSpot = spot; + while (!in.eof()) { + char c = in.get(); + + if ((c == '\n') || (c == '\r') || (c == '\f')) { gobble(in); newSpot = in.tellg(); break; } + else if (int(c) == -1) { break; } + } + + //there was not another line before the end of the file + unsigned long long sanityPos = in.tellg(); + + if (sanityPos == -1) { break; } + else { filePos.push_back(newSpot); } + + in.close(); + } + + //save end pos + filePos.push_back(size); + + //sanity check filePos + for (int i = 0; i < (filePos.size()-1); i++) { + if (filePos[(i+1)] <= filePos[i]) { filePos.erase(filePos.begin()+(i+1)); i--; } + } + + proc = (filePos.size() - 1); +#else + mothurOut("[ERROR]: Windows version should not be calling the divideFile function."); mothurOutEndLine(); + proc=1; + filePos.push_back(size); +#endif + return filePos; + } + catch(exception& e) { + errorOut(e, "MothurOut", "divideFile"); + exit(1); + } +} +/**************************************************************************************************/ int MothurOut::divideFile(string filename, int& proc, vector& files) { try{ @@ -2643,7 +2720,7 @@ void MothurOut::splitAtDash(string& estim, vector& container) { string individual = ""; int estimLength = estim.size(); bool prevEscape = false; - for(int i=0;i& container) { prevEscape = false; } } - } + }*/ + + + for(int i=0;i& container) { exit(1); } } + /***********************************************************************/ string MothurOut::makeList(vector& names) { try { @@ -2810,11 +2909,11 @@ void MothurOut::splitAtChar(string& prefix, string& suffix, char c){ string space = " "; while(suffix.at(0) == ' ') suffix = suffix.substr(1, suffix.length()); - } + }else { suffix = ""; } - } + } catch(exception& e) { - errorOut(e, "MothurOut", "splitAtComma"); + errorOut(e, "MothurOut", "splitAtChar"); exit(1); } } @@ -2830,7 +2929,7 @@ void MothurOut::splitAtComma(string& prefix, string& suffix){ string space = " "; while(suffix.at(0) == ' ') suffix = suffix.substr(1, suffix.length()); - } + }else { suffix = ""; } } catch(exception& e) { diff --git a/mothurout.h b/mothurout.h index ffdcaf6..be657f4 100644 --- a/mothurout.h +++ b/mothurout.h @@ -74,7 +74,8 @@ class MothurOut { //functions from mothur.h //file operations bool dirCheck(string&); //completes path, appends appropriate / or \, makes sure dir is writable. - vector divideFile(string, int&); + vector divideFile(string, int&); //divides splitting unevenness by sequence + vector divideFilePerLine(string, int&); //divides splitting unevenness at line breaks int divideFile(string, int&, vector&); vector setFilePosEachLine(string, int&); vector setFilePosFasta(string, int&); diff --git a/parsefastaqcommand.cpp b/parsefastaqcommand.cpp index 63ed314..89f97ac 100644 --- a/parsefastaqcommand.cpp +++ b/parsefastaqcommand.cpp @@ -16,7 +16,7 @@ vector ParseFastaQCommand::setParameters(){ CommandParameter pfastq("fastq", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pfastq); CommandParameter pfasta("fasta", "Boolean", "", "T", "", "", "","fasta",false,false); parameters.push_back(pfasta); CommandParameter pqual("qfile", "Boolean", "", "T", "", "", "","qfile",false,false); parameters.push_back(pqual); - CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa", "sanger", "", "", "","",false,false,true); parameters.push_back(pformat); + CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa-illumina1.8+", "sanger", "", "", "","",false,false,true); parameters.push_back(pformat); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); @@ -36,7 +36,7 @@ string ParseFastaQCommand::getHelpString(){ helpString += "The fastq.info command reads a fastq file and creates a fasta and quality file.\n"; helpString += "The fastq.info command parameters are fastq, fasta, qfile and format; fastq is required.\n"; helpString += "The fastq.info command should be in the following format: fastq.info(fastaq=yourFastaQFile).\n"; - helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa or illumina, default=sanger.\n"; + helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=sanger.\n"; helpString += "The fasta parameter allows you to indicate whether you want a fasta file generated. Default=T.\n"; helpString += "The qfile parameter allows you to indicate whether you want a quality file generated. Default=T.\n"; helpString += "Example fastq.info(fastaq=test.fastaq).\n"; @@ -136,8 +136,8 @@ ParseFastaQCommand::ParseFastaQCommand(string option){ format = validParameter.validFile(parameters, "format", false); if (format == "not found"){ format = "sanger"; } - if ((format != "sanger") && (format != "illumina") && (format != "solexa")) { - m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa and illumina, aborting." ); m->mothurOutEndLine(); + if ((format != "sanger") && (format != "illumina") && (format != "illumina1.8+") && (format != "solexa")) { + m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa, illumina1.8+ and illumina, aborting." ); m->mothurOutEndLine(); abort=true; } @@ -249,21 +249,28 @@ vector ParseFastaQCommand::convertQual(string qual) { try { vector qualScores; + bool negativeScores = false; + for (int i = 0; i < qual.length(); i++) { int temp = 0; temp = int(qual[i]); if (format == "illumina") { temp -= 64; //char '@' + }else if (format == "illumina1.8+") { + temp -= int('!'); //char '!' }else if (format == "solexa") { temp = int(convertTable[temp]); //convert to sanger temp -= int('!'); //char '!' }else { temp -= int('!'); //char '!' } + if (temp < -5) { negativeScores = true; } qualScores.push_back(temp); } + if (negativeScores) { m->mothurOut("[ERROR]: finding negative quality scores, do you have the right format selected? http://en.wikipedia.org/wiki/FASTQ_format#Encoding \n"); m->control_pressed = true; } + return qualScores; } catch(exception& e) { diff --git a/primerdesigncommand.cpp b/primerdesigncommand.cpp index 48bb00f..59369b3 100644 --- a/primerdesigncommand.cpp +++ b/primerdesigncommand.cpp @@ -39,7 +39,7 @@ vector PrimerDesignCommand::setParameters(){ string PrimerDesignCommand::getHelpString(){ try { string helpString = ""; - helpString += "The primer.design allows you to design sequence fragments that are specific to particular OTUs.\n"; + helpString += "The primer.design allows you to identify sequence fragments that are specific to particular OTUs.\n"; helpString += "The primer.design command parameters are: list, fasta, name, count, otunumber, cutoff, length, pdiffs, mintm, maxtm, processors and label.\n"; helpString += "The list parameter allows you to provide a list file and is required.\n"; helpString += "The fasta parameter allows you to provide a fasta file and is required.\n"; diff --git a/primerdesigncommand.h b/primerdesigncommand.h index 78c12de..2879d2b 100644 --- a/primerdesigncommand.h +++ b/primerdesigncommand.h @@ -31,7 +31,7 @@ public: string getOutputPattern(string); string getHelpString(); string getCitation() { return "http://www.mothur.org/wiki/Primer.design"; } - string getDescription() { return "design sequence fragments that are specific to particular OTUs"; } + string getDescription() { return "identify sequence fragments that are specific to particular OTUs"; } int execute(); void help() { m->mothurOut(getHelpString()); } diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index 1e6b5a3..5149550 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -14,11 +14,14 @@ vector ScreenSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta); + CommandParameter pcontigsreport("contigsreport", "InputTypes", "", "", "report", "none", "none","contigsreport",false,true,true); parameters.push_back(pcontigsreport); + CommandParameter palignreport("alignreport", "InputTypes", "", "", "report", "none", "none","alignreport",false,false); parameters.push_back(palignreport); + CommandParameter psummary("summary", "InputTypes", "", "", "report", "none", "none","summary",false,false); parameters.push_back(psummary); CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname); CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount); CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup); CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none","qfile",false,false); parameters.push_back(pqfile); - CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none","alignreport",false,false); parameters.push_back(palignreport); + CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false); parameters.push_back(ptax); CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pstart); CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pend); @@ -29,8 +32,20 @@ vector ScreenSeqsCommand::setParameters(){ CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); CommandParameter pcriteria("criteria", "Number", "", "90", "", "", "","",false,false); parameters.push_back(pcriteria); CommandParameter poptimize("optimize", "Multiple", "none-start-end-maxambig-maxhomop-minlength-maxlength", "none", "", "", "","",true,false); parameters.push_back(poptimize); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); + + //report parameters + CommandParameter pminoverlap("minoverlap", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminoverlap); + CommandParameter postart("ostart", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(postart); + CommandParameter poend("oend", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(poend); + CommandParameter pmismatches("mismatches", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmismatches); + CommandParameter pmaxn("maxn", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxn); + CommandParameter pminscore("minscore", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminscore); + CommandParameter pmaxinsert("maxinsert", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxinsert); + CommandParameter pminsim("minsim", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pminsim); + + vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -46,15 +61,25 @@ string ScreenSeqsCommand::getHelpString(){ try { string helpString = ""; helpString += "The screen.seqs command reads a fastafile and screens sequences.\n"; - helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, count, qfile, alignreport, taxonomy, optimize, criteria and processors.\n"; + helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, count, qfile, alignreport, contigsreport, summary, taxonomy, optimize, criteria and processors.\n"; helpString += "The fasta parameter is required.\n"; - helpString += "The alignreport and taxonomy parameters allow you to remove bad seqs from taxonomy and alignreport files.\n"; + helpString += "The contigsreport parameter allows you to use the contigsreport file to determine if a sequence is good. Screening parameters include: minoverlap, ostart, oend and mismatches. \n"; + helpString += "The alignreport parameter allows you to use the alignreport file to determine if a sequence is good. Screening parameters include: minsim, minscore and maxinsert. \n"; + helpString += "The summary parameter allows you to use the summary file from summary.seqs to save time processing.\n"; + helpString += "The taxonomy parameter allows you to remove bad seqs from taxonomy files.\n"; helpString += "The start parameter is used to set a position the \"good\" sequences must start by. The default is -1.\n"; helpString += "The end parameter is used to set a position the \"good\" sequences must end after. The default is -1.\n"; helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n"; helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n"; helpString += "The minlength parameter allows you to set and minimum sequence length. \n"; - helpString += "The maxlength parameter allows you to set and maximum sequence length. \n"; + helpString += "The maxn parameter allows you to set and maximum number of N's allowed in a sequence. \n"; + helpString += "The minoverlap parameter allows you to set and minimum overlap. The default is -1. \n"; + helpString += "The ostart parameter is used to set an overlap position the \"good\" sequences must start by. The default is -1. \n"; + helpString += "The oend parameter is used to set an overlap position the \"good\" sequences must end after. The default is -1.\n"; + helpString += "The mismatches parameter allows you to set and maximum mismatches in the contigs.report. \n"; + helpString += "The minsim parameter allows you to set the minimum similarity to template sequences during alignment. Found in column \'SimBtwnQuery&Template\' in align.report file.\n"; + helpString += "The minscore parameter allows you to set the minimum search score during alignment. Found in column \'SearchScore\' in align.report file.\n"; + helpString += "The maxinsert parameter allows you to set the maximum number of insertions during alignment. Found in column \'LongestInsert\' in align.report file.\n"; helpString += "The processors parameter allows you to specify the number of processors to use while running the command. The default is 1.\n"; helpString += "The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .\n"; helpString += "For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.\n"; @@ -84,6 +109,8 @@ string ScreenSeqsCommand::getOutputPattern(string type) { else if (type == "accnos") { pattern = "[filename],bad.accnos"; } else if (type == "qfile") { pattern = "[filename],good,[extension]"; } else if (type == "alignreport") { pattern = "[filename],good.align.report"; } + else if (type == "contigsreport") { pattern = "[filename],good.contigs.report"; } + else if (type == "summary") { pattern = "[filename],good.summary"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } return pattern; @@ -103,6 +130,8 @@ ScreenSeqsCommand::ScreenSeqsCommand(){ outputTypes["name"] = tempOutNames; outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; + outputTypes["contigsreport"] = tempOutNames; + outputTypes["summary"] = tempOutNames; outputTypes["accnos"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["taxonomy"] = tempOutNames; @@ -147,7 +176,10 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { outputTypes["qfile"] = tempOutNames; outputTypes["taxonomy"] = tempOutNames; outputTypes["count"] = tempOutNames; - + outputTypes["contigsreport"] = tempOutNames; + outputTypes["summary"] = tempOutNames; + + //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); if (inputDir == "not found"){ inputDir = ""; } @@ -184,6 +216,22 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["alignreport"] = inputDir + it->second; } } + + it = parameters.find("contigsreport"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["contigsreport"] = inputDir + it->second; } + } + + it = parameters.find("summary"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["summary"] = inputDir + it->second; } + } it = parameters.find("qfile"); //user has given a template file @@ -240,6 +288,14 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { else if (countfile == "not found") { countfile = ""; } else { m->setCountTableFile(countfile); } + contigsreport = validParameter.validFile(parameters, "contigsreport", true); + if (contigsreport == "not open") { contigsreport = ""; abort = true; } + else if (contigsreport == "not found") { contigsreport = ""; } + + summaryfile = validParameter.validFile(parameters, "summary", true); + if (summaryfile == "not open") { summaryfile = ""; abort = true; } + else if (summaryfile == "not found") { summaryfile = ""; } + if ((namefile != "") && (countfile != "")) { m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; } @@ -287,16 +343,71 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { m->setProcessors(temp); m->mothurConvert(temp, processors); + temp = validParameter.validFile(parameters, "minoverlap", false); if (temp == "not found") { temp = "-1"; } + m->mothurConvert(temp, minOverlap); + + temp = validParameter.validFile(parameters, "ostart", false); if (temp == "not found") { temp = "-1"; } + m->mothurConvert(temp, oStart); + + temp = validParameter.validFile(parameters, "oend", false); if (temp == "not found") { temp = "-1"; } + m->mothurConvert(temp, oEnd); + + temp = validParameter.validFile(parameters, "mismatches", false); if (temp == "not found") { temp = "-1"; } + m->mothurConvert(temp, mismatches); + + temp = validParameter.validFile(parameters, "maxn", false); if (temp == "not found") { temp = "-1"; } + m->mothurConvert(temp, maxN); + + temp = validParameter.validFile(parameters, "minscore", false); if (temp == "not found") { temp = "-1"; } + m->mothurConvert(temp, minScore); + + temp = validParameter.validFile(parameters, "maxinsert", false); if (temp == "not found") { temp = "-1"; } + m->mothurConvert(temp, maxInsert); + + temp = validParameter.validFile(parameters, "minsim", false); if (temp == "not found") { temp = "-1"; } + m->mothurConvert(temp, minSim); + temp = validParameter.validFile(parameters, "optimize", false); //optimizing trumps the optimized values original value if (temp == "not found"){ temp = "none"; } m->splitAtDash(temp, optimize); + + if ((contigsreport != "") && ((summaryfile != "") || ( alignreport != ""))) { + m->mothurOut("[ERROR]: You may only provide one of the following: contigsreport, alignreport or summary, aborting.\n"); abort=true; + } + + if ((alignreport != "") && ((summaryfile != "") || ( contigsreport != ""))) { + m->mothurOut("[ERROR]: You may only provide one of the following: contigsreport, alignreport or summary, aborting.\n"); abort=true; + } + + if ((summaryfile != "") && ((alignreport != "") || ( contigsreport != ""))) { + m->mothurOut("[ERROR]: You may only provide one of the following: contigsreport, alignreport or summary, aborting.\n"); abort=true; + } + //check to make sure you have the files you need for certain screening + if ((contigsreport == "") && ((minOverlap != -1) || (oStart != -1) || (oEnd != -1) || (mismatches != -1))) { + m->mothurOut("[ERROR]: minoverlap, ostart, oend and mismatches can only be used with a contigs.report file, aborting.\n"); abort=true; + } + + if ((alignreport == "") && ((minScore != -1) || (maxInsert != -1) || (minSim != -1))) { + m->mothurOut("[ERROR]: minscore, maxinsert and minsim can only be used with a align.report file, aborting.\n"); abort=true; + } + //check for invalid optimize options set validOptimizers; - validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength"); + validOptimizers.insert("none"); validOptimizers.insert("start"); validOptimizers.insert("end"); validOptimizers.insert("maxambig"); validOptimizers.insert("maxhomop"); validOptimizers.insert("minlength"); validOptimizers.insert("maxlength"); validOptimizers.insert("maxn"); + if (contigsreport != "") { validOptimizers.insert("minoverlap"); validOptimizers.insert("ostart"); validOptimizers.insert("oend"); validOptimizers.insert("mismatches"); } + if (alignreport != "") { validOptimizers.insert("minscore"); validOptimizers.insert("maxinsert"); validOptimizers.insert("minsim"); } + for (int i = 0; i < optimize.size(); i++) { if (validOptimizers.count(optimize[i]) == 0) { - m->mothurOut(optimize[i] + " is not a valid optimizer. Valid options are start, end, maxambig, maxhomop, minlength and maxlength."); m->mothurOutEndLine(); + m->mothurOut(optimize[i] + " is not a valid optimizer with your input files. Valid options are "); + string valid = ""; + for (set::iterator it = validOptimizers.begin(); it != validOptimizers.end(); it++) { + valid += (*it) + ", "; + } + if (valid.length() != 0) { valid = valid.substr(0, valid.length()-2); } + m->mothurOut(valid + "."); + m->mothurOutEndLine(); optimize.erase(optimize.begin()+i); i--; } @@ -328,190 +439,38 @@ int ScreenSeqsCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } - //if the user want to optimize we need to know the 90% mark - vector positions; - if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here - //use the namefile to optimize correctly - if (namefile != "") { nameMap = m->readNames(namefile); } - else if (countfile != "") { - CountTable ct; - ct.readTable(countfile); - nameMap = ct.getNameMap(); - } - getSummary(positions); - } - else { - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) - positions = m->divideFile(fastafile, processors); - for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); } - #else - if(processors == 1){ lines.push_back(linePair(0, 1000)); } - else { - int numFastaSeqs = 0; - positions = m->setFilePosFasta(fastafile, numFastaSeqs); - if (positions.size() < processors) { processors = positions.size(); } - - //figure out how many sequences you have to process - int numSeqsPerProcessor = numFastaSeqs / processors; - for (int i = 0; i < processors; i++) { - int startIndex = i * numSeqsPerProcessor; - if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } - lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor)); - } - } - #endif - } + map badSeqNames; + int start = time(NULL); + int numFastaSeqs = 0; - map variables; - variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile)); - string badAccnosFile = getOutputFileName("accnos",variables); - variables["[extension]"] = m->getExtension(fastafile); - string goodSeqFile = getOutputFileName("fasta", variables); - + if ((contigsreport == "") && (summaryfile == "") && (alignreport == "")) { numFastaSeqs = screenFasta(badSeqNames); } + else { numFastaSeqs = screenReports(badSeqNames); } - int numFastaSeqs = 0; - set badSeqNames; - int start = time(NULL); - -#ifdef USE_MPI - int pid, numSeqsPerProcessor; - int tag = 2001; - vector MPIPos; - - MPI_Status status; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - MPI_Comm_size(MPI_COMM_WORLD, &processors); - - MPI_File inMPI; - MPI_File outMPIGood; - MPI_File outMPIBadAccnos; - - int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; - int inMode=MPI_MODE_RDONLY; - - char outGoodFilename[1024]; - strcpy(outGoodFilename, goodSeqFile.c_str()); - - char outBadAccnosFilename[1024]; - strcpy(outBadAccnosFilename, badAccnosFile.c_str()); - - char inFileName[1024]; - strcpy(inFileName, fastafile.c_str()); - - MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer - MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood); - MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos); - - if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; } - - if (pid == 0) { //you are the root process - - MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs - - //send file positions to all processes - for(int i = 1; i < processors; i++) { - MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); - MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); - } - - //figure out how many sequences you have to align - numSeqsPerProcessor = numFastaSeqs / processors; - int startIndex = pid * numSeqsPerProcessor; - if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; } - - //align your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames); - - if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; } - - for (int i = 1; i < processors; i++) { - //get bad lists - int badSize; - MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status); - } - }else{ //you are a child process - MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); - MPIPos.resize(numFastaSeqs+1); - MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); - - //figure out how many sequences you have to align - numSeqsPerProcessor = numFastaSeqs / processors; - int startIndex = pid * numSeqsPerProcessor; - if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; } - - //align your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames); - - if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; } - - //send bad list - int badSize = badSeqNames.size(); - MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); - } - - //close files - MPI_File_close(&inMPI); - MPI_File_close(&outMPIGood); - MPI_File_close(&outMPIBadAccnos); - MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case - -#else - if(processors == 1){ numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames); } - else{ numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames); } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; } -#endif - - #ifdef USE_MPI - MPI_Comm_rank(MPI_COMM_WORLD, &pid); - - if (pid == 0) { //only one process should fix files - - //read accnos file with all names in it, process 0 just has its names - MPI_File inMPIAccnos; - MPI_Offset size; - - char inFileName[1024]; - strcpy(inFileName, badAccnosFile.c_str()); - - MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos); //comm, filename, mode, info, filepointer - MPI_File_get_size(inMPIAccnos, &size); - - char* buffer = new char[size]; - MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status); - - string tempBuf = buffer; - if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } - istringstream iss (tempBuf,istringstream::in); - - delete buffer; - MPI_File_close(&inMPIAccnos); - - badSeqNames.clear(); - string tempName; - while (!iss.eof()) { - iss >> tempName; m->gobble(iss); - badSeqNames.insert(tempName); - } - #endif - + #ifdef USE_MPI + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + + if (pid == 0) { //only one process should fix files + #endif + if(namefile != "" && groupfile != "") { screenNameGroupFile(badSeqNames); - if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } }else if(namefile != "") { screenNameGroupFile(badSeqNames); - if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } }else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the group else if (countfile != "") { screenCountFile(badSeqNames); } - if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - if(alignreport != "") { screenAlignReport(badSeqNames); } if(qualfile != "") { screenQual(badSeqNames); } if(taxonomy != "") { screenTaxonomy(badSeqNames); } - if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } #ifdef USE_MPI } @@ -519,8 +478,6 @@ int ScreenSeqsCommand::execute(){ m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - m->mothurOut(goodSeqFile); m->mothurOutEndLine(); outputTypes["fasta"].push_back(goodSeqFile); - m->mothurOut(badAccnosFile); m->mothurOutEndLine(); outputTypes["accnos"].push_back(badAccnosFile); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); m->mothurOutEndLine(); @@ -552,120 +509,1303 @@ int ScreenSeqsCommand::execute(){ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } } - itTypes = outputTypes.find("count"); - if (itTypes != outputTypes.end()) { - if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } + + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences."); + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "execute"); + exit(1); + } +} +//***************************************************************************************************************/ +int ScreenSeqsCommand::runFastaScreening(map& badSeqNames){ + try{ + int numFastaSeqs = 0; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile)); + string badAccnosFile = getOutputFileName("accnos",variables); + variables["[extension]"] = m->getExtension(fastafile); + string goodSeqFile = getOutputFileName("fasta", variables); + outputNames.push_back(goodSeqFile); outputTypes["fasta"].push_back(goodSeqFile); + outputNames.push_back(badAccnosFile); outputTypes["accnos"].push_back(badAccnosFile); + +#ifdef USE_MPI + int pid, numSeqsPerProcessor; + int tag = 2001; + vector MPIPos; + + MPI_Status status; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + MPI_File inMPI; + MPI_File outMPIGood; + MPI_File outMPIBadAccnos; + + int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + int inMode=MPI_MODE_RDONLY; + + char outGoodFilename[1024]; + strcpy(outGoodFilename, goodSeqFile.c_str()); + + char outBadAccnosFilename[1024]; + strcpy(outBadAccnosFilename, badAccnosFile.c_str()); + + char inFileName[1024]; + strcpy(inFileName, fastafile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + MPI_File_open(MPI_COMM_WORLD, outGoodFilename, outMode, MPI_INFO_NULL, &outMPIGood); + MPI_File_open(MPI_COMM_WORLD, outBadAccnosFilename, outMode, MPI_INFO_NULL, &outMPIBadAccnos); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; } + + if (pid == 0) { //you are the root process + + MPIPos = m->setFilePosFasta(fastafile, numFastaSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + for(int i = 1; i < processors; i++) { + MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); + MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + } + + //figure out how many sequences you have to align + numSeqsPerProcessor = numFastaSeqs / processors; + int startIndex = pid * numSeqsPerProcessor; + if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; } + + //align your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; } + + for (int i = 1; i < processors; i++) { + //get bad lists + int badSize; + MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status); + } + }else{ //you are a child process + MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); + MPIPos.resize(numFastaSeqs+1); + MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); + + //figure out how many sequences you have to align + numSeqsPerProcessor = numFastaSeqs / processors; + int startIndex = pid * numSeqsPerProcessor; + if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; } + + //align your part + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames); + + if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; } + + //send bad list + int badSize = badSeqNames.size(); + MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); + } + + //close files + MPI_File_close(&inMPI); + MPI_File_close(&outMPIGood); + MPI_File_close(&outMPIBadAccnos); + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case + +#else + if(processors == 1){ numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames); } + else{ numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames); } + + if (m->control_pressed) { m->mothurRemove(goodSeqFile); return numFastaSeqs; } +#endif + +#ifdef USE_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + + if (pid == 0) { //only one process should fix files + + //read accnos file with all names in it, process 0 just has its names + MPI_File inMPIAccnos; + MPI_Offset size; + + char inFileName[1024]; + strcpy(inFileName, badAccnosFile.c_str()); + + MPI_File_open(MPI_COMM_SELF, inFileName, inMode, MPI_INFO_NULL, &inMPIAccnos); //comm, filename, mode, info, filepointer + MPI_File_get_size(inMPIAccnos, &size); + + char* buffer = new char[size]; + MPI_File_read(inMPIAccnos, buffer, size, MPI_CHAR, &status); + + string tempBuf = buffer; + if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } + istringstream iss (tempBuf,istringstream::in); + + delete buffer; + MPI_File_close(&inMPIAccnos); + + badSeqNames.clear(); + string tempName, trashCode; + while (!iss.eof()) { + iss >> tempName >> trashCode; m->gobble(iss); + badSeqNames[tempName] = trashCode; + } + } +#endif + + + return numFastaSeqs; + + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "runFastaScreening"); + exit(1); + } +} +//***************************************************************************************************************/ +int ScreenSeqsCommand::screenReports(map& badSeqNames){ + try{ + int numFastaSeqs = 0; + bool summarizedFasta = false; + + //did not provide a summary file, but set a parameter that requires summarizing the fasta file + //or did provide a summary file, but set maxn parameter so we must summarize the fasta file + vector positions; + if (((summaryfile == "") && ((m->inUsersGroups("maxambig", optimize)) ||(m->inUsersGroups("maxhomop", optimize)) ||(m->inUsersGroups("maxlength", optimize)) || (m->inUsersGroups("minlength", optimize)) || (m->inUsersGroups("start", optimize)) || (m->inUsersGroups("end", optimize)))) || ((summaryfile != "") && m->inUsersGroups("maxn", optimize))) { + //use the namefile to optimize correctly + if (namefile != "") { nameMap = m->readNames(namefile); } + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameMap = ct.getNameMap(); + } + getSummary(positions); + summarizedFasta = true; + } else { + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + positions = m->divideFile(fastafile, processors); + for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); } + #else + if(processors == 1){ lines.push_back(linePair(0, 1000)); } + else { + int numFastaSeqs = 0; + positions = m->setFilePosFasta(fastafile, numFastaSeqs); + if (positions.size() < processors) { processors = positions.size(); } + + //figure out how many sequences you have to process + int numSeqsPerProcessor = numFastaSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } + lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor)); + } + } + #endif + } + + if ((summaryfile != "") && ((m->inUsersGroups("maxambig", optimize)) ||(m->inUsersGroups("maxhomop", optimize)) ||(m->inUsersGroups("maxlength", optimize)) || (m->inUsersGroups("minlength", optimize)) || (m->inUsersGroups("start", optimize)) || (m->inUsersGroups("end", optimize))) && !summarizedFasta) { //summarize based on summaryfile + if (namefile != "") { nameMap = m->readNames(namefile); } + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameMap = ct.getNameMap(); + } + getSummaryReport(); + }else if ((contigsreport != "") && ((m->inUsersGroups("minoverlap", optimize)) || (m->inUsersGroups("ostart", optimize)) || (m->inUsersGroups("oend", optimize)) || (m->inUsersGroups("mismatches", optimize)))) { //optimize settings based on contigs file + optimizeContigs(); + }else if ((alignreport != "") && ((m->inUsersGroups("minsim", optimize)) || (m->inUsersGroups("minscore", optimize)) || (m->inUsersGroups("maxinsert", optimize)))) { //optimize settings based on contigs file + optimizeAlign(); + } + + + //provided summary file, and did not set maxn so no need to summarize fasta + if (summaryfile != "") { numFastaSeqs = screenSummary(badSeqNames); } + //add in any seqs that fail due to contigs report results + else if (contigsreport != "") { numFastaSeqs = screenContigs(badSeqNames); } + //add in any seqs that fail due to align report + else if (alignreport != "") { numFastaSeqs = screenAlignReport(badSeqNames); } + + return numFastaSeqs; + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "screenReports"); + exit(1); + } +} +//*************************************************************************************************************** +int ScreenSeqsCommand::screenAlignReport(map& badSeqNames){ + try { + + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(alignreport)); + string outSummary = getOutputFileName("alignreport",variables); + outputNames.push_back(outSummary); outputTypes["alignreport"].push_back(outSummary); + + string name, TemplateName, SearchMethod, AlignmentMethod; + //QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template + //checking for minScore, maxInsert, minSim + int length, TemplateLength, QueryStart, QueryEnd, TemplateStart, TemplateEnd, PairwiseAlignmentLength, GapsInQuery, GapsInTemplate, LongestInsert; + float SearchScore, SimBtwnQueryTemplate; + + ofstream out; + m->openOutputFile(outSummary, out); + + //read summary file + ifstream in; + m->openInputFile(alignreport, in); + out << (m->getline(in)) << endl; //skip headers + + int count = 0; + + while (!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); return 0; } + + //seqname start end nbases ambigs polymer numSeqs + in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; m->gobble(in); + + bool goodSeq = 1; // innocent until proven guilty + string trashCode = ""; + if(maxInsert != -1 && maxInsert < LongestInsert) { goodSeq = 0; trashCode += "insert|"; } + if(minScore != -1 && minScore > SearchScore) { goodSeq = 0; trashCode += "score|"; } + if(minSim != -1 && minSim > SimBtwnQueryTemplate) { goodSeq = 0; trashCode += "sim|"; } + + if(goodSeq == 1){ + out << name << '\t' << length << '\t' << TemplateName << '\t' << TemplateLength << '\t' << SearchMethod << '\t' << SearchScore << '\t' << AlignmentMethod << '\t' << QueryStart << '\t' << QueryEnd << '\t' << TemplateStart << '\t' << TemplateEnd << '\t' << PairwiseAlignmentLength << '\t' << GapsInQuery << '\t' << GapsInTemplate << '\t' << LongestInsert << '\t' << SimBtwnQueryTemplate << endl; + } + else{ badSeqNames[name] = trashCode; } + count++; + } + in.close(); + out.close(); + + int oldBadSeqsCount = badSeqNames.size(); + + int numFastaSeqs = runFastaScreening(badSeqNames); + + if (oldBadSeqsCount != badSeqNames.size()) { //more seqs were removed by maxns + m->renameFile(outSummary, outSummary+".temp"); + + ofstream out2; + m->openOutputFile(outSummary, out2); + + //read summary file + ifstream in2; + m->openInputFile(outSummary+".temp", in2); + out2 << (m->getline(in2)) << endl; //skip headers + + while (!in2.eof()) { + + if (m->control_pressed) { in2.close(); out2.close(); return 0; } + + //seqname start end nbases ambigs polymer numSeqs + in2 >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; m->gobble(in2); + + if (badSeqNames.count(name) == 0) { //are you good? + out2 << name << '\t' << length << '\t' << TemplateName << '\t' << TemplateLength << '\t' << SearchMethod << '\t' << SearchScore << '\t' << AlignmentMethod << '\t' << QueryStart << '\t' << QueryEnd << '\t' << TemplateStart << '\t' << TemplateEnd << '\t' << PairwiseAlignmentLength << '\t' << GapsInQuery << '\t' << GapsInTemplate << '\t' << LongestInsert << '\t' << SimBtwnQueryTemplate << endl; + } + } + in2.close(); + out2.close(); + m->mothurRemove(outSummary+".temp"); + } + + if (numFastaSeqs != count) { m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your contigs report file, quitting.\n"); m->control_pressed = true; } + + + return count; + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport"); + exit(1); + } + +} +//***************************************************************************************************************/ +int ScreenSeqsCommand::screenContigs(map& badSeqNames){ + try{ + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(contigsreport)); + string outSummary = getOutputFileName("contigsreport",variables); + outputNames.push_back(outSummary); outputTypes["contigsreport"].push_back(outSummary); + + string name; + //Name Length Overlap_Length Overlap_Start Overlap_End MisMatches Num_Ns + int length, OLength, thisOStart, thisOEnd, numMisMatches, numNs; + + ofstream out; + m->openOutputFile(outSummary, out); + + //read summary file + ifstream in; + m->openInputFile(contigsreport, in); + out << (m->getline(in)) << endl; //skip headers + + int count = 0; + + while (!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); return 0; } + + //seqname start end nbases ambigs polymer numSeqs + in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numNs; m->gobble(in); + + bool goodSeq = 1; // innocent until proven guilty + string trashCode = ""; + if(oStart != -1 && oStart < thisOStart) { goodSeq = 0; trashCode += "ostart|"; } + if(oEnd != -1 && oEnd > thisOEnd) { goodSeq = 0; trashCode += "oend|"; } + if(maxN != -1 && maxN < numNs) { goodSeq = 0; trashCode += "n|"; } + if(minOverlap != -1 && minOverlap > OLength) { goodSeq = 0; trashCode += "olength|"; } + if(mismatches != -1 && mismatches < numMisMatches) { goodSeq = 0; trashCode += "mismatches|"; } + + if(goodSeq == 1){ + out << name << '\t' << length << '\t' << OLength << '\t' << thisOStart << '\t' << thisOEnd << '\t' << numMisMatches << '\t' << numNs << endl; + } + else{ badSeqNames[name] = trashCode; } + count++; + } + in.close(); + out.close(); + + int oldBadSeqsCount = badSeqNames.size(); + + int numFastaSeqs = runFastaScreening(badSeqNames); + + if (oldBadSeqsCount != badSeqNames.size()) { //more seqs were removed by maxns + m->renameFile(outSummary, outSummary+".temp"); + + ofstream out2; + m->openOutputFile(outSummary, out2); + + //read summary file + ifstream in2; + m->openInputFile(outSummary+".temp", in2); + out2 << (m->getline(in2)) << endl; //skip headers + + while (!in2.eof()) { + + if (m->control_pressed) { in2.close(); out2.close(); return 0; } + + //seqname start end nbases ambigs polymer numSeqs + in2 >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numNs; m->gobble(in2); + + if (badSeqNames.count(name) == 0) { //are you good? + out2 << name << '\t' << length << '\t' << OLength << '\t' << thisOStart << '\t' << thisOEnd << '\t' << numMisMatches << '\t' << numNs << endl; + } + } + in2.close(); + out2.close(); + m->mothurRemove(outSummary+".temp"); + } + + if (numFastaSeqs != count) { m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your contigs report file, quitting.\n"); m->control_pressed = true; } + + + return count; + + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "screenContigs"); + exit(1); + } +} +//***************************************************************************************************************/ +int ScreenSeqsCommand::screenSummary(map& badSeqNames){ + try{ + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(summaryfile)); + string outSummary = getOutputFileName("summary",variables); + outputNames.push_back(outSummary); outputTypes["summary"].push_back(outSummary); + + string name; + int start, end, length, ambigs, polymer, numReps; + + ofstream out; + m->openOutputFile(outSummary, out); + + //read summary file + ifstream in; + m->openInputFile(summaryfile, in); + out << (m->getline(in)) << endl; //skip headers + + int count = 0; + + while (!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); return 0; } + + //seqname start end nbases ambigs polymer numSeqs + in >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in); + + bool goodSeq = 1; // innocent until proven guilty + string trashCode = ""; + if(startPos != -1 && startPos < start) { goodSeq = 0; trashCode += "start|"; } + if(endPos != -1 && endPos > end) { goodSeq = 0; trashCode += "end|"; } + if(maxAmbig != -1 && maxAmbig < ambigs) { goodSeq = 0; trashCode += "ambig|"; } + if(maxHomoP != -1 && maxHomoP < polymer) { goodSeq = 0; trashCode += "homop|"; } + if(minLength != -1 && minLength > length) { goodSeq = 0; trashCode += "renameFile(outSummary, outSummary+".temp"); + + ofstream out2; + m->openOutputFile(outSummary, out2); + + //read summary file + ifstream in2; + m->openInputFile(outSummary+".temp", in2); + out2 << (m->getline(in2)) << endl; //skip headers + + while (!in2.eof()) { + + if (m->control_pressed) { in2.close(); out2.close(); return 0; } + + //seqname start end nbases ambigs polymer numSeqs + in2 >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in2); + + if (badSeqNames.count(name) == 0) { //are you good? + out2 << name << '\t' << start << '\t' << end << '\t' << length << '\t' << ambigs << '\t' << polymer << '\t' << numReps << endl; + } + } + in2.close(); + out2.close(); + m->mothurRemove(outSummary+".temp"); + } + + if (numFastaSeqs != count) { m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your summary file, quitting.\n"); m->control_pressed = true; } + + + + return count; + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "screenSummary"); + exit(1); + } +} +//***************************************************************************************************************/ +int ScreenSeqsCommand::screenFasta(map& badSeqNames){ + try{ + + + //if the user want to optimize we need to know the 90% mark + vector positions; + if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here + //use the namefile to optimize correctly + if (namefile != "") { nameMap = m->readNames(namefile); } + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameMap = ct.getNameMap(); + } + getSummary(positions); + }else { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + positions = m->divideFile(fastafile, processors); + for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); } +#else + if(processors == 1){ lines.push_back(linePair(0, 1000)); } + else { + int numFastaSeqs = 0; + positions = m->setFilePosFasta(fastafile, numFastaSeqs); + if (positions.size() < processors) { processors = positions.size(); } + + //figure out how many sequences you have to process + int numSeqsPerProcessor = numFastaSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } + lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor)); + } + } +#endif + } + + if (m->control_pressed) { return 0; } + + int numFastaSeqs = runFastaScreening(badSeqNames); + + return numFastaSeqs; + + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "screenFasta"); + exit(1); + } +} +//*************************************************************************************************************** + +int ScreenSeqsCommand::screenNameGroupFile(map badSeqNames){ + try { + ifstream inputNames; + m->openInputFile(namefile, inputNames); + map badSeqGroups; + string seqName, seqList, group; + map::iterator it; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile)); + variables["[extension]"] = m->getExtension(namefile); + string goodNameFile = getOutputFileName("name", variables); + outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile); + + ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut); + + while(!inputNames.eof()){ + if (m->control_pressed) { goodNameOut.close(); inputNames.close(); m->mothurRemove(goodNameFile); return 0; } + + inputNames >> seqName; m->gobble(inputNames); inputNames >> seqList; + it = badSeqNames.find(seqName); + + if(it != badSeqNames.end()){ + badSeqNames.erase(it); + + if(namefile != ""){ + int start = 0; + for(int i=0;isecond; + start = i+1; + } + } + badSeqGroups[seqList.substr(start,seqList.length()-start)] = it->second; + } + } + else{ + goodNameOut << seqName << '\t' << seqList << endl; + } + m->gobble(inputNames); + } + inputNames.close(); + goodNameOut.close(); + + //we were unable to remove some of the bad sequences + if (badSeqNames.size() != 0) { + for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) { + m->mothurOut("Your namefile does not include the sequence " + it->first + " please correct."); + m->mothurOutEndLine(); + } + } + + if(groupfile != ""){ + + ifstream inputGroups; + m->openInputFile(groupfile, inputGroups); + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(groupfile)); + variables["[extension]"] = m->getExtension(groupfile); + string goodGroupFile = getOutputFileName("group", variables); + + outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile); + + ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut); + + while(!inputGroups.eof()){ + if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodNameFile); m->mothurRemove(goodGroupFile); return 0; } + + inputGroups >> seqName; m->gobble(inputGroups); inputGroups >> group; + + it = badSeqGroups.find(seqName); + + if(it != badSeqGroups.end()){ + badSeqGroups.erase(it); + } + else{ + goodGroupOut << seqName << '\t' << group << endl; + } + m->gobble(inputGroups); + } + inputGroups.close(); + goodGroupOut.close(); + + //we were unable to remove some of the bad sequences + if (badSeqGroups.size() != 0) { + for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) { + m->mothurOut("Your groupfile does not include the sequence " + it->first + " please correct."); + m->mothurOutEndLine(); + } + } + } + + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile"); + exit(1); + } +} +//*************************************************************************************************************** +int ScreenSeqsCommand::getSummaryReport(){ + try { + + vector startPosition; + vector endPosition; + vector seqLength; + vector ambigBases; + vector longHomoPolymer; + +#ifdef USE_MPI + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + + if (pid == 0) { +#endif + + + //read summary file + ifstream in; + m->openInputFile(summaryfile, in); + m->getline(in); + + string name; + int start, end, length, ambigs, polymer, numReps; + + while (!in.eof()) { + + if (m->control_pressed) { in.close(); return 0; } + + //seqname start end nbases ambigs polymer numSeqs + in >> name >> start >> end >> length >> ambigs >> polymer >> numReps; m->gobble(in); + + int num = 1; + if ((namefile != "") || (countfile !="")) { + //make sure this sequence is in the namefile, else error + map::iterator it = nameMap.find(name); + + if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + else { num = it->second; } + } + + //for each sequence this sequence represents + for (int i = 0; i < num; i++) { + startPosition.push_back(start); + endPosition.push_back(end); + seqLength.push_back(length); + ambigBases.push_back(ambigs); + longHomoPolymer.push_back(polymer); + } + + } + in.close(); + + sort(startPosition.begin(), startPosition.end()); + sort(endPosition.begin(), endPosition.end()); + sort(seqLength.begin(), seqLength.end()); + sort(ambigBases.begin(), ambigBases.end()); + sort(longHomoPolymer.begin(), longHomoPolymer.end()); + + //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs + int criteriaPercentile = int(startPosition.size() * (criteria / (float) 100)); + + for (int i = 0; i < optimize.size(); i++) { + if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); } + else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100)); endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();} + else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); } + else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); } + else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); } + else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); } + } + +#ifdef USE_MPI + } + + MPI_Status status; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + if (pid == 0) { + //send file positions to all processes + for(int i = 1; i < processors; i++) { + MPI_Send(&startPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + MPI_Send(&endPos, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + MPI_Send(&maxAmbig, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + MPI_Send(&maxHomoP, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + MPI_Send(&minLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + MPI_Send(&maxLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + } + }else { + MPI_Recv(&startPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + MPI_Recv(&endPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + MPI_Recv(&maxAmbig, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + MPI_Recv(&maxHomoP, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + MPI_Recv(&minLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + MPI_Recv(&maxLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + } + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case +#endif + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "getSummaryReport"); + exit(1); + } +} +//*************************************************************************************************************** +int ScreenSeqsCommand::optimizeContigs(){ + try { + vector olengths; + vector oStarts; + vector oEnds; + vector numMismatches; + vector numNs; + + vector positions; + vector contigsLines; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + positions = m->divideFilePerLine(contigsreport, processors); + for (int i = 0; i < (positions.size()-1); i++) { contigsLines.push_back(linePair(positions[i], positions[(i+1)])); } +#else + if(processors == 1){ contigsLines.push_back(linePair(0, 1000)); } + else { + int numContigsSeqs = 0; + positions = m->setFilePosEachLine(contigsreport, numContigsSeqs); + if (positions.size() < processors) { processors = positions.size(); } + + //figure out how many sequences you have to process + int numSeqsPerProcessor = numContigsSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numContigsSeqs - i * numSeqsPerProcessor; } + contigsLines.push_back(linePair(positions[startIndex], numSeqsPerProcessor)); + } + } +#endif + +#ifdef USE_MPI + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + + if (pid == 0) { + driverContigsSummary(olengths, oStarts, oEnds, numMismatches, numNs, contigsLines[0]); +#else + createProcessesContigsSummary(olengths, oStarts, oEnds, numMismatches, numNs, contigsLines); + + if (m->control_pressed) { return 0; } +#endif + sort(olengths.begin(), olengths.end()); + sort(oStarts.begin(), oStarts.end()); + sort(oEnds.begin(), oEnds.end()); + sort(numMismatches.begin(), numMismatches.end()); + sort(numNs.begin(), numNs.end()); + + //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs + int criteriaPercentile = int(oStarts.size() * (criteria / (float) 100)); + + for (int i = 0; i < optimize.size(); i++) { + if (optimize[i] == "ostart") { oStart = oStarts[criteriaPercentile]; m->mothurOut("Optimizing ostart to " + toString(oStart) + "."); m->mothurOutEndLine(); } + else if (optimize[i] == "oend") { int endcriteriaPercentile = int(oEnds.size() * ((100 - criteria) / (float) 100)); oEnd = oEnds[endcriteriaPercentile]; m->mothurOut("Optimizing oend to " + toString(oEnd) + "."); m->mothurOutEndLine();} + else if (optimize[i] == "mismatches") { mismatches = numMismatches[criteriaPercentile]; m->mothurOut("Optimizing mismatches to " + toString(mismatches) + "."); m->mothurOutEndLine(); } + else if (optimize[i] == "maxn") { maxN = numNs[criteriaPercentile]; m->mothurOut("Optimizing maxn to " + toString(maxN) + "."); m->mothurOutEndLine(); } + else if (optimize[i] == "minoverlap") { int mincriteriaPercentile = int(olengths.size() * ((100 - criteria) / (float) 100)); minOverlap = olengths[mincriteriaPercentile]; m->mothurOut("Optimizing minoverlap to " + toString(minOverlap) + "."); m->mothurOutEndLine(); } + + } + +#ifdef USE_MPI + } + + MPI_Status status; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + if (pid == 0) { + //send file positions to all processes + for(int i = 1; i < processors; i++) { + MPI_Send(&minOverlap, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + MPI_Send(&oStart, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + MPI_Send(&oEnd, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + MPI_Send(&mismatches, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + MPI_Send(&maxN, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + } + }else { + MPI_Recv(&minOverlap, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + MPI_Recv(&oStart, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + MPI_Recv(&oEnd, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + MPI_Recv(&mismatches, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + MPI_Recv(&maxN, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + } + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case +#endif + return 0; + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "optimizeContigs"); + exit(1); + } +} +/**************************************************************************************/ +int ScreenSeqsCommand::driverContigsSummary(vector& oLength, vector& ostartPosition, vector& oendPosition, vector& omismatches, vector& numNs, linePair filePos) { + try { + + string name; + //Name Length Overlap_Length Overlap_Start Overlap_End MisMatches Num_Ns + int length, OLength, thisOStart, thisOEnd, numMisMatches, numns; + + ifstream in; + m->openInputFile(contigsreport, in); + + in.seekg(filePos.start); + if (filePos.start == 0) { //read headers + m->getline(in); m->gobble(in); + } + + bool done = false; + int count = 0; + + while (!done) { + + if (m->control_pressed) { in.close(); return 1; } + + //seqname start end nbases ambigs polymer numSeqs + in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numns; m->gobble(in); + + int num = 1; + if ((namefile != "") || (countfile !="")){ + //make sure this sequence is in the namefile, else error + map::iterator it = nameMap.find(name); + + if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + else { num = it->second; } + } + + //for each sequence this sequence represents + for (int i = 0; i < num; i++) { + ostartPosition.push_back(thisOStart); + oendPosition.push_back(thisOEnd); + oLength.push_back(OLength); + omismatches.push_back(numMisMatches); + numNs.push_back(numns); + } + + count++; + + //if((count) % 100 == 0){ m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine(); } +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + unsigned long long pos = in.tellg(); + if ((pos == -1) || (pos >= filePos.end)) { break; } +#else + if (in.eof()) { break; } +#endif + } + + in.close(); + + return count; + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "driverContigsSummary"); + exit(1); + } +} + +/**************************************************************************************************/ +int ScreenSeqsCommand::createProcessesContigsSummary(vector& oLength, vector& ostartPosition, vector& oendPosition, vector& omismatches, vector& numNs, vector contigsLines) { + try { + + int process = 1; + int num = 0; + vector processIDS; + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + num = driverContigsSummary(oLength, ostartPosition, oendPosition, omismatches, numNs, contigsLines[process]); + + //pass numSeqs to parent + ofstream out; + string tempFile = contigsreport + toString(getpid()) + ".num.temp"; + m->openOutputFile(tempFile, out); + + out << num << endl; + out << ostartPosition.size() << endl; + for (int k = 0; k < ostartPosition.size(); k++) { out << ostartPosition[k] << '\t'; } out << endl; + for (int k = 0; k < oendPosition.size(); k++) { out << oendPosition[k] << '\t'; } out << endl; + for (int k = 0; k < oLength.size(); k++) { out << oLength[k] << '\t'; } out << endl; + for (int k = 0; k < omismatches.size(); k++) { out << omismatches[k] << '\t'; } out << endl; + for (int k = 0; k < numNs.size(); k++) { out << numNs[k] << '\t'; } out << endl; + + out.close(); + + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } + } + + num = driverContigsSummary(oLength, ostartPosition, oendPosition, omismatches, numNs, contigsLines[0]); + + //force parent to wait until all the processes are done + for (int i=0;iopenInputFile(tempFilename, in); + + int temp, tempNum; + in >> tempNum; m->gobble(in); num += tempNum; + in >> tempNum; m->gobble(in); + for (int k = 0; k < tempNum; k++) { in >> temp; ostartPosition.push_back(temp); } m->gobble(in); + for (int k = 0; k < tempNum; k++) { in >> temp; oendPosition.push_back(temp); } m->gobble(in); + for (int k = 0; k < tempNum; k++) { in >> temp; oLength.push_back(temp); } m->gobble(in); + for (int k = 0; k < tempNum; k++) { in >> temp; omismatches.push_back(temp); } m->gobble(in); + for (int k = 0; k < tempNum; k++) { in >> temp; numNs.push_back(temp); } m->gobble(in); + + in.close(); + m->mothurRemove(tempFilename); + } + + +#else + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the seqSumData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + //Taking advantage of shared memory to allow both threads to add info to vectors. + ////////////////////////////////////////////////////////////////////////////////////////////////////// + /* + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=0; icount; + if (pDataArray[i]->count != pDataArray[i]->end) { + m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; + } + for (int k = 0; k < pDataArray[i]->ostartPosition.size(); k++) { ostartPosition.push_back(pDataArray[i]->ostartPosition[k]); } + for (int k = 0; k < pDataArray[i]->oendPosition.size(); k++) { oendPosition.push_back(pDataArray[i]->oendPosition[k]); } + for (int k = 0; k < pDataArray[i]->oLength.size(); k++) { oLength.push_back(pDataArray[i]->oLength[k]); } + for (int k = 0; k < pDataArray[i]->omismatches.size(); k++) { omismatches.push_back(pDataArray[i]->omismatches[k]); } + for (int k = 0; k < pDataArray[i]->numNs.size(); k++) { numNs.push_back(pDataArray[i]->numNs[k]); } + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + */ +#endif + return num; + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "createProcessesContigsSummary"); + exit(1); + } +} +//*************************************************************************************************************** +int ScreenSeqsCommand::optimizeAlign(){ + try { + + vector sims; + vector scores; + vector inserts; + + vector positions; + vector alignLines; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + positions = m->divideFilePerLine(alignreport, processors); + for (int i = 0; i < (positions.size()-1); i++) { alignLines.push_back(linePair(positions[i], positions[(i+1)])); } +#else + if(processors == 1){ alignLines.push_back(linePair(0, 1000)); } + else { + int numAlignSeqs = 0; + positions = m->setFilePosEachLine(alignreport, numAlignSeqs); + if (positions.size() < processors) { processors = positions.size(); } + + //figure out how many sequences you have to process + int numSeqsPerProcessor = numAlignSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numAlignSeqs - i * numSeqsPerProcessor; } + alignLines.push_back(linePair(positions[startIndex], numSeqsPerProcessor)); + } + } +#endif + +#ifdef USE_MPI + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + + if (pid == 0) { + driverAlignSummary(sims, scores, inserts, alignLines[0]); +#else + createProcessesAlignSummary(sims, scores, inserts, alignLines); + + if (m->control_pressed) { return 0; } +#endif + sort(sims.begin(), sims.end()); + sort(scores.begin(), scores.end()); + sort(inserts.begin(), inserts.end()); + + //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs + int criteriaPercentile = int(sims.size() * (criteria / (float) 100)); + + for (int i = 0; i < optimize.size(); i++) { + if (optimize[i] == "minsim") { int mincriteriaPercentile = int(sims.size() * ((100 - criteria) / (float) 100)); minSim = sims[mincriteriaPercentile]; m->mothurOut("Optimizing minsim to " + toString(minSim) + "."); m->mothurOutEndLine();} + else if (optimize[i] == "minscore") { int mincriteriaPercentile = int(scores.size() * ((100 - criteria) / (float) 100)); minScore = scores[mincriteriaPercentile]; m->mothurOut("Optimizing minscore to " + toString(minScore) + "."); m->mothurOutEndLine(); } + else if (optimize[i] == "maxinsert") { maxInsert = inserts[criteriaPercentile]; m->mothurOut("Optimizing maxinsert to " + toString(maxInsert) + "."); m->mothurOutEndLine(); } + } + +#ifdef USE_MPI + } + + MPI_Status status; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + if (pid == 0) { + //send file positions to all processes + for(int i = 1; i < processors; i++) { + MPI_Send(&minSim, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + MPI_Send(&minScore, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + MPI_Send(&maxInsert, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + } + }else { + MPI_Recv(&minSim, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + MPI_Recv(&minScore, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + MPI_Recv(&maxInsert, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); } - - m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences."); - m->mothurOutEndLine(); - + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case +#endif return 0; } catch(exception& e) { - m->errorOut(e, "ScreenSeqsCommand", "execute"); + m->errorOut(e, "ScreenSeqsCommand", "optimizeContigs"); exit(1); } } - -//*************************************************************************************************************** - -int ScreenSeqsCommand::screenNameGroupFile(set badSeqNames){ +/**************************************************************************************/ +int ScreenSeqsCommand::driverAlignSummary(vector& sims, vector& scores, vector& inserts, linePair filePos) { try { - ifstream inputNames; - m->openInputFile(namefile, inputNames); - set badSeqGroups; - string seqName, seqList, group; - set::iterator it; - map variables; - variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile)); - variables["[extension]"] = m->getExtension(namefile); - string goodNameFile = getOutputFileName("name", variables); - outputNames.push_back(goodNameFile); outputTypes["name"].push_back(goodNameFile); - ofstream goodNameOut; m->openOutputFile(goodNameFile, goodNameOut); + string name, TemplateName, SearchMethod, AlignmentMethod; + //QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template + //checking for minScore, maxInsert, minSim + int length, TemplateLength, QueryStart, QueryEnd, TemplateStart, TemplateEnd, PairwiseAlignmentLength, GapsInQuery, GapsInTemplate, LongestInsert; + float SearchScore, SimBtwnQueryTemplate; + + ifstream in; + m->openInputFile(alignreport, in); + + in.seekg(filePos.start); + if (filePos.start == 0) { //read headers + m->getline(in); m->gobble(in); + } + + bool done = false; + int count = 0; + + while (!done) { + + if (m->control_pressed) { in.close(); return 1; } + + in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; m->gobble(in); + + int num = 1; + if ((namefile != "") || (countfile !="")){ + //make sure this sequence is in the namefile, else error + map::iterator it = nameMap.find(name); + + if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + else { num = it->second; } + } + + //for each sequence this sequence represents + for (int i = 0; i < num; i++) { + sims.push_back(SimBtwnQueryTemplate); + scores.push_back(SearchScore); + inserts.push_back(LongestInsert); + } + + count++; + + //if((count) % 100 == 0){ m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine(); } +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + unsigned long long pos = in.tellg(); + if ((pos == -1) || (pos >= filePos.end)) { break; } +#else + if (in.eof()) { break; } +#endif + } - while(!inputNames.eof()){ - if (m->control_pressed) { goodNameOut.close(); inputNames.close(); m->mothurRemove(goodNameFile); return 0; } + in.close(); + + return count; + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "driverAlignSummary"); + exit(1); + } +} - inputNames >> seqName; m->gobble(inputNames); inputNames >> seqList; - it = badSeqNames.find(seqName); +/**************************************************************************************************/ +int ScreenSeqsCommand::createProcessesAlignSummary(vector& sims, vector& scores, vector& inserts, vector alignLines) { + try { + + int process = 1; + int num = 0; + vector processIDS; + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + num = driverAlignSummary(sims, scores, inserts, alignLines[process]); - if(it != badSeqNames.end()){ - badSeqNames.erase(it); + //pass numSeqs to parent + ofstream out; + string tempFile = alignreport + toString(getpid()) + ".num.temp"; + m->openOutputFile(tempFile, out); - if(namefile != ""){ - int start = 0; - for(int i=0;igobble(inputNames); - } - inputNames.close(); - goodNameOut.close(); - - //we were unable to remove some of the bad sequences - if (badSeqNames.size() != 0) { - for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) { - m->mothurOut("Your namefile does not include the sequence " + *it + " please correct."); - m->mothurOutEndLine(); - } - } - - if(groupfile != ""){ - - ifstream inputGroups; - m->openInputFile(groupfile, inputGroups); - variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(groupfile)); - variables["[extension]"] = m->getExtension(groupfile); - string goodGroupFile = getOutputFileName("group", variables); - - outputNames.push_back(goodGroupFile); outputTypes["group"].push_back(goodGroupFile); - - ofstream goodGroupOut; m->openOutputFile(goodGroupFile, goodGroupOut); - - while(!inputGroups.eof()){ - if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodNameFile); m->mothurRemove(goodGroupFile); return 0; } - - inputGroups >> seqName; m->gobble(inputGroups); inputGroups >> group; + out << num << endl; + out << sims.size() << endl; + for (int k = 0; k < sims.size(); k++) { out << sims[k] << '\t'; } out << endl; + for (int k = 0; k < scores.size(); k++) { out << scores[k] << '\t'; } out << endl; + for (int k = 0; k < inserts.size(); k++) { out << inserts[k] << '\t'; } out << endl; - it = badSeqGroups.find(seqName); + out.close(); - if(it != badSeqGroups.end()){ - badSeqGroups.erase(it); - } - else{ - goodGroupOut << seqName << '\t' << group << endl; - } - m->gobble(inputGroups); + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); } - inputGroups.close(); - goodGroupOut.close(); + } + + num = driverAlignSummary(sims, scores, inserts, alignLines[0]); + + //force parent to wait until all the processes are done + for (int i=0;iopenInputFile(tempFilename, in); - //we were unable to remove some of the bad sequences - if (badSeqGroups.size() != 0) { - for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) { - m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); - m->mothurOutEndLine(); - } - } + int temp, tempNum; + float temp2; + in >> tempNum; m->gobble(in); num += tempNum; + in >> tempNum; m->gobble(in); + for (int k = 0; k < tempNum; k++) { in >> temp2; sims.push_back(temp2); } m->gobble(in); + for (int k = 0; k < tempNum; k++) { in >> temp2; scores.push_back(temp2); } m->gobble(in); + for (int k = 0; k < tempNum; k++) { in >> temp; inserts.push_back(temp); } m->gobble(in); + + in.close(); + m->mothurRemove(tempFilename); } - return 0; - +#else + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the seqSumData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + //Taking advantage of shared memory to allow both threads to add info to vectors. + ////////////////////////////////////////////////////////////////////////////////////////////////////// + /* + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=0; icount; + if (pDataArray[i]->count != pDataArray[i]->end) { + m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; + } + for (int k = 0; k < pDataArray[i]->sims.size(); k++) { sims.push_back(pDataArray[i]->sims[k]); } + for (int k = 0; k < pDataArray[i]->scores.size(); k++) { scores.push_back(pDataArray[i]->scores[k]); } + for (int k = 0; k < pDataArray[i]->inserts.size(); k++) { inserts.push_back(pDataArray[i]->inserts[k]); } + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + */ +#endif + return num; } catch(exception& e) { - m->errorOut(e, "ScreenSeqsCommand", "screenNameGroupFile"); + m->errorOut(e, "ScreenSeqsCommand", "createProcessesAlignSummary"); exit(1); } } @@ -678,6 +1818,7 @@ int ScreenSeqsCommand::getSummary(vector& positions){ vector seqLength; vector ambigBases; vector longHomoPolymer; + vector numNs; vector positions; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) @@ -705,27 +1846,24 @@ int ScreenSeqsCommand::getSummary(vector& positions){ MPI_Comm_rank(MPI_COMM_WORLD, &pid); if (pid == 0) { - driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]); + driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[0]); #else int numSeqs = 0; //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ - numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]); + numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[0]); }else{ - numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile); + numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile); } if (m->control_pressed) { return 0; } - //#else - // numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]); - // if (m->control_pressed) { return 0; } - //#endif #endif sort(startPosition.begin(), startPosition.end()); sort(endPosition.begin(), endPosition.end()); sort(seqLength.begin(), seqLength.end()); sort(ambigBases.begin(), ambigBases.end()); sort(longHomoPolymer.begin(), longHomoPolymer.end()); + sort(numNs.begin(), numNs.end()); //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs int criteriaPercentile = int(startPosition.size() * (criteria / (float) 100)); @@ -737,6 +1875,7 @@ int ScreenSeqsCommand::getSummary(vector& positions){ else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); } else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); } else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); } + else if (optimize[i] == "maxn") { maxN = numNs[criteriaPercentile]; m->mothurOut("Optimizing maxn to " + toString(maxN) + "."); m->mothurOutEndLine(); } } #ifdef USE_MPI @@ -755,6 +1894,7 @@ int ScreenSeqsCommand::getSummary(vector& positions){ MPI_Send(&maxHomoP, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); MPI_Send(&minLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); MPI_Send(&maxLength, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); + MPI_Send(&maxN, 1, MPI_INT, i, 2001, MPI_COMM_WORLD); } }else { MPI_Recv(&startPos, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); @@ -763,6 +1903,7 @@ int ScreenSeqsCommand::getSummary(vector& positions){ MPI_Recv(&maxHomoP, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); MPI_Recv(&minLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); MPI_Recv(&maxLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + MPI_Recv(&maxN, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); } MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #endif @@ -774,7 +1915,7 @@ int ScreenSeqsCommand::getSummary(vector& positions){ } } /**************************************************************************************/ -int ScreenSeqsCommand::driverCreateSummary(vector& startPosition, vector& endPosition, vector& seqLength, vector& ambigBases, vector& longHomoPolymer, string filename, linePair filePos) { +int ScreenSeqsCommand::driverCreateSummary(vector& startPosition, vector& endPosition, vector& seqLength, vector& ambigBases, vector& longHomoPolymer, vector& numNs, string filename, linePair filePos) { try { ifstream in; @@ -793,7 +1934,7 @@ int ScreenSeqsCommand::driverCreateSummary(vector& startPosition, vector::iterator it = nameMap.find(current.getName()); @@ -802,12 +1943,14 @@ int ScreenSeqsCommand::driverCreateSummary(vector& startPosition, vector& startPosition, vector& startPosition, vector& endPosition, vector& seqLength, vector& ambigBases, vector& longHomoPolymer, string filename) { +int ScreenSeqsCommand::createProcessesCreateSummary(vector& startPosition, vector& endPosition, vector& seqLength, vector& ambigBases, vector& longHomoPolymer, vector& numNs, string filename) { try { int process = 1; @@ -849,7 +1992,7 @@ int ScreenSeqsCommand::createProcessesCreateSummary(vector& startPosition, processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[process]); + num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[process]); //pass numSeqs to parent ofstream out; @@ -863,6 +2006,7 @@ int ScreenSeqsCommand::createProcessesCreateSummary(vector& startPosition, for (int k = 0; k < seqLength.size(); k++) { out << seqLength[k] << '\t'; } out << endl; for (int k = 0; k < ambigBases.size(); k++) { out << ambigBases[k] << '\t'; } out << endl; for (int k = 0; k < longHomoPolymer.size(); k++) { out << longHomoPolymer[k] << '\t'; } out << endl; + for (int k = 0; k < numNs.size(); k++) { out << numNs[k] << '\t'; } out << endl; out.close(); @@ -874,7 +2018,7 @@ int ScreenSeqsCommand::createProcessesCreateSummary(vector& startPosition, } } - num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]); + num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[0]); //force parent to wait until all the processes are done for (int i=0;i& startPosition, for (int k = 0; k < tempNum; k++) { in >> temp; seqLength.push_back(temp); } m->gobble(in); for (int k = 0; k < tempNum; k++) { in >> temp; ambigBases.push_back(temp); } m->gobble(in); for (int k = 0; k < tempNum; k++) { in >> temp; longHomoPolymer.push_back(temp); } m->gobble(in); + for (int k = 0; k < tempNum; k++) { in >> temp; numNs.push_back(temp); } m->gobble(in); in.close(); m->mothurRemove(tempFilename); @@ -917,7 +2062,7 @@ int ScreenSeqsCommand::createProcessesCreateSummary(vector& startPosition, for( int i=0; i& startPosition, } //do your part - num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[processors-1]); + num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[processors-1]); //Wait until all threads have terminated. WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); @@ -942,6 +2087,7 @@ int ScreenSeqsCommand::createProcessesCreateSummary(vector& startPosition, for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]); } for (int k = 0; k < pDataArray[i]->ambigBases.size(); k++) { ambigBases.push_back(pDataArray[i]->ambigBases[k]); } for (int k = 0; k < pDataArray[i]->longHomoPolymer.size(); k++) { longHomoPolymer.push_back(pDataArray[i]->longHomoPolymer[k]); } + for (int k = 0; k < pDataArray[i]->numNs.size(); k++) { numNs.push_back(pDataArray[i]->numNs[k]); } CloseHandle(hThreadArray[i]); delete pDataArray[i]; } @@ -957,12 +2103,12 @@ int ScreenSeqsCommand::createProcessesCreateSummary(vector& startPosition, //*************************************************************************************************************** -int ScreenSeqsCommand::screenGroupFile(set badSeqNames){ +int ScreenSeqsCommand::screenGroupFile(map badSeqNames){ try { ifstream inputGroups; m->openInputFile(groupfile, inputGroups); string seqName, group; - set::iterator it; + map::iterator it; map variables; variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(groupfile)); variables["[extension]"] = m->getExtension(groupfile); @@ -990,7 +2136,7 @@ int ScreenSeqsCommand::screenGroupFile(set badSeqNames){ //we were unable to remove some of the bad sequences if (badSeqNames.size() != 0) { for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) { - m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); + m->mothurOut("Your groupfile does not include the sequence " + it->first + " please correct."); m->mothurOutEndLine(); } } @@ -1009,11 +2155,11 @@ int ScreenSeqsCommand::screenGroupFile(set badSeqNames){ } } //*************************************************************************************************************** -int ScreenSeqsCommand::screenCountFile(set badSeqNames){ +int ScreenSeqsCommand::screenCountFile(map badSeqNames){ try { ifstream in; m->openInputFile(countfile, in); - set::iterator it; + map::iterator it; map variables; variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile)); variables["[extension]"] = m->getExtension(countfile); @@ -1049,7 +2195,7 @@ int ScreenSeqsCommand::screenCountFile(set badSeqNames){ //we were unable to remove some of the bad sequences if (badSeqNames.size() != 0) { for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) { - m->mothurOut("Your count file does not include the sequence " + *it + " please correct."); + m->mothurOut("Your count file does not include the sequence " + it->first + " please correct."); m->mothurOutEndLine(); } } @@ -1076,79 +2222,12 @@ int ScreenSeqsCommand::screenCountFile(set badSeqNames){ } //*************************************************************************************************************** -int ScreenSeqsCommand::screenAlignReport(set badSeqNames){ - try { - ifstream inputAlignReport; - m->openInputFile(alignreport, inputAlignReport); - string seqName, group; - set::iterator it; - - map variables; - variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(alignreport)); - string goodAlignReportFile = getOutputFileName("alignreport", variables); - - outputNames.push_back(goodAlignReportFile); outputTypes["alignreport"].push_back(goodAlignReportFile); - ofstream goodAlignReportOut; m->openOutputFile(goodAlignReportFile, goodAlignReportOut); - - while (!inputAlignReport.eof()) { // need to copy header - char c = inputAlignReport.get(); - goodAlignReportOut << c; - if (c == 10 || c == 13){ break; } - } - - while(!inputAlignReport.eof()){ - if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); m->mothurRemove(goodAlignReportFile); return 0; } - - inputAlignReport >> seqName; - it = badSeqNames.find(seqName); - string line; - while (!inputAlignReport.eof()) { // need to copy header - char c = inputAlignReport.get(); - line += c; - if (c == 10 || c == 13){ break; } - } - - if(it != badSeqNames.end()){ - badSeqNames.erase(it); - } - else{ - goodAlignReportOut << seqName << '\t' << line; - } - m->gobble(inputAlignReport); - } - - if (m->control_pressed) { goodAlignReportOut.close(); inputAlignReport.close(); m->mothurRemove(goodAlignReportFile); return 0; } - - //we were unable to remove some of the bad sequences - if (badSeqNames.size() != 0) { - for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) { - m->mothurOut("Your alignreport file does not include the sequence " + *it + " please correct."); - m->mothurOutEndLine(); - } - } - - inputAlignReport.close(); - goodAlignReportOut.close(); - - if (m->control_pressed) { m->mothurRemove(goodAlignReportFile); return 0; } - - return 0; - - } - catch(exception& e) { - m->errorOut(e, "ScreenSeqsCommand", "screenAlignReport"); - exit(1); - } - -} -//*************************************************************************************************************** - -int ScreenSeqsCommand::screenTaxonomy(set badSeqNames){ +int ScreenSeqsCommand::screenTaxonomy(map badSeqNames){ try { ifstream input; m->openInputFile(taxonomy, input); string seqName, tax; - set::iterator it; + map::iterator it; map variables; variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(taxonomy)); variables["[extension]"] = m->getExtension(taxonomy); @@ -1175,7 +2254,7 @@ int ScreenSeqsCommand::screenTaxonomy(set badSeqNames){ //we were unable to remove some of the bad sequences if (badSeqNames.size() != 0) { for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) { - m->mothurOut("Your taxonomy file does not include the sequence " + *it + " please correct."); + m->mothurOut("Your taxonomy file does not include the sequence " + it->first + " please correct."); m->mothurOutEndLine(); } } @@ -1196,11 +2275,11 @@ int ScreenSeqsCommand::screenTaxonomy(set badSeqNames){ } //*************************************************************************************************************** -int ScreenSeqsCommand::screenQual(set badSeqNames){ +int ScreenSeqsCommand::screenQual(map badSeqNames){ try { ifstream in; m->openInputFile(qualfile, in); - set::iterator it; + map::iterator it; map variables; variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qualfile)); variables["[extension]"] = m->getExtension(qualfile); @@ -1254,7 +2333,7 @@ int ScreenSeqsCommand::screenQual(set badSeqNames){ //we were unable to remove some of the bad sequences if (badSeqNames.size() != 0) { for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) { - m->mothurOut("Your qual file does not include the sequence " + *it + " please correct."); + m->mothurOut("Your qual file does not include the sequence " + it->first + " please correct."); m->mothurOutEndLine(); } } @@ -1272,7 +2351,7 @@ int ScreenSeqsCommand::screenQual(set badSeqNames){ } //********************************************************************************************************************** -int ScreenSeqsCommand::driver(linePair filePos, string goodFName, string badAccnosFName, string filename, set& badSeqNames){ +int ScreenSeqsCommand::driver(linePair filePos, string goodFName, string badAccnosFName, string filename, map& badSeqNames){ try { ofstream goodFile; m->openOutputFile(goodFName, goodFile); @@ -1287,7 +2366,7 @@ int ScreenSeqsCommand::driver(linePair filePos, string goodFName, string badAccn bool done = false; int count = 0; - + while (!done) { if (m->control_pressed) { return 0; } @@ -1295,21 +2374,31 @@ int ScreenSeqsCommand::driver(linePair filePos, string goodFName, string badAccn Sequence currSeq(inFASTA); m->gobble(inFASTA); if (currSeq.getName() != "") { bool goodSeq = 1; // innocent until proven guilty - if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; } - if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; } - if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; } - if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; } - if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; } - if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; } + string trashCode = ""; + //have the report files found you bad + map::iterator it = badSeqNames.find(currSeq.getName()); + if (it != badSeqNames.end()) { goodSeq = 0; trashCode = it->second; } + + if (summaryfile == "") { //summaryfile includes these so no need to check again + if(startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; trashCode += "start|"; } + if(endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; trashCode += "end|";} + if(maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; trashCode += "ambig|";} + if(maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; trashCode += "homop|";} + if(minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; trashCode += "& MPIPos, set& badSeqNames){ +int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector& MPIPos, map& badSeqNames){ try { string outputString = ""; MPI_Status statusGood; @@ -1368,13 +2457,25 @@ int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& //process seq if (currSeq.getName() != "") { bool goodSeq = 1; // innocent until proven guilty - if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; } - if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; } - if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; } - if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; } - if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; } - if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; } + string trashCode = ""; + //have the report files found you bad + map::iterator it = badSeqNames.find(currSeq.getName()); + if (it != badSeqNames.end()) { goodSeq = 0; trashCode = it->second; } + + if (summaryfile == "") { //summaryfile includes these so no need to check again + if(startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; trashCode += "start|"; } + if(endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; trashCode += "end|";} + if(maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; trashCode += "ambig|";} + if(maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; trashCode += "homop|";} + if(minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; trashCode += "& badSeqNames) { +int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, map& badSeqNames) { try { vector processIDS; @@ -1478,10 +2579,10 @@ int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, st if (ableToOpen == 0) { badSeqNames.clear(); - string tempName; + string tempName, trashCode; while (!inBad.eof()) { - inBad >> tempName; m->gobble(inBad); - badSeqNames.insert(tempName); + inBad >> tempName >> trashCode; m->gobble(inBad); + badSeqNames[tempName] = trashCode; } inBad.close(); } @@ -1504,7 +2605,7 @@ int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, st if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); } // Allocate memory for thread data. - sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, filename, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension); + sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, maxN, badSeqNames, filename, summaryfile, contigsreport, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension); pDataArray.push_back(tempSum); //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier @@ -1524,7 +2625,7 @@ int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, st if (pDataArray[i]->count != pDataArray[i]->end) { m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; } - for (set::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); } + for (map::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames[it->first] = it->second; } CloseHandle(hThreadArray[i]); delete pDataArray[i]; } diff --git a/screenseqscommand.h b/screenseqscommand.h index 6f4e8ac..e290690 100644 --- a/screenseqscommand.h +++ b/screenseqscommand.h @@ -43,30 +43,44 @@ private: vector lines; - int screenNameGroupFile(set); - int screenGroupFile(set); - int screenCountFile(set); - int screenAlignReport(set); - int screenQual(set); - int screenTaxonomy(set); - - int driver(linePair, string, string, string, set&); - int createProcesses(string, string, string, set&); + int screenNameGroupFile(map); + int screenGroupFile(map); + int screenCountFile(map); + int screenAlignReport(map&); + int screenQual(map); + int screenTaxonomy(map); + int optimizeContigs(); + int optimizeAlign(); + int driver(linePair, string, string, string, map&); + int createProcesses(string, string, string, map&); + int screenSummary(map&); + int screenContigs(map&); + int runFastaScreening(map&); + int screenFasta(map&); + int screenReports(map&); + int getSummary(vector&); + int createProcessesCreateSummary(vector&, vector&, vector&, vector&, vector&, vector&, string); + int driverCreateSummary(vector&, vector&, vector&, vector&, vector&, vector&, string, linePair); + int getSummaryReport(); + int driverContigsSummary(vector&, vector&, vector&, vector&, vector&, linePair); + int createProcessesContigsSummary(vector&, vector&, vector&, vector&, vector&, vector); + int driverAlignSummary(vector&, vector&, vector&, linePair); + int createProcessesAlignSummary(vector&, vector&, vector&, vector); + #ifdef USE_MPI - int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&, set&); + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&, map&); #endif bool abort; - string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy, countfile; - int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, criteria; + string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy, countfile, contigsreport, summaryfile; + int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, criteria, minOverlap, oStart, oEnd, mismatches, maxN, maxInsert; + float minSim, minScore; vector outputNames; vector optimize; map nameMap; - int getSummary(vector&); - int createProcessesCreateSummary(vector&, vector&, vector&, vector&, vector&, string); - int driverCreateSummary(vector&, vector&, vector&, vector&, vector&, string, linePair); + }; /**************************************************************************************************/ @@ -79,7 +93,8 @@ struct sumData { vector seqLength; vector ambigBases; vector longHomoPolymer; - string filename, namefile; + vector numNs; + string filename, namefile, countfile; unsigned long long start; unsigned long long end; int count; @@ -88,9 +103,65 @@ struct sumData { sumData(){} - sumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, map nam) { + sumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map nam) { + filename = f; + namefile = nf; + countfile = cf; + m = mout; + start = st; + end = en; + nameMap = nam; + count = 0; + } +}; +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct contigsSumData { + vector ostartPosition; + vector oendPosition; + vector oLength; + vector omismatches; + vector numNs; + string filename, namefile, countfile; + unsigned long long start; + unsigned long long end; + int count; + MothurOut* m; + map nameMap; + + + contigsSumData(){} + contigsSumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map nam) { + filename = f; + namefile = nf; + countfile = cf; + m = mout; + start = st; + end = en; + nameMap = nam; + count = 0; + } +}; +/**************************************************************************************************/ +struct alignsData { + vector sims; + vector scores; + vector inserts; + string filename, namefile, countfile; + unsigned long long start; + unsigned long long end; + int count; + MothurOut* m; + map nameMap; + + + alignsData(){} + alignsData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map nam) { filename = f; namefile = nf; + countfile = cf; m = mout; start = st; end = en; @@ -98,34 +169,40 @@ struct sumData { count = 0; } }; + /**************************************************************************************************/ //custom data structure for threads to use. // This is passed by void pointer so it can be any data type // that can be passed using a single void pointer (LPVOID). struct sumScreenData { - int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength; + int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, maxN; unsigned long long start; unsigned long long end; int count; MothurOut* m; string goodFName, badAccnosFName, filename; - set badSeqNames; + map badSeqNames; + string summaryfile, contigsreport; sumScreenData(){} - sumScreenData(int s, int e, int a, int h, int minl, int maxl, string f, MothurOut* mout, unsigned long long st, unsigned long long en, string gf, string bf) { + sumScreenData(int s, int e, int a, int h, int minl, int maxl, int mn, map bs, string f, string sum, string cont, MothurOut* mout, unsigned long long st, unsigned long long en, string gf, string bf) { startPos = s; endPos = e; minLength = minl; maxLength = maxl; maxAmbig = a; maxHomoP = h; + maxN = mn; filename = f; goodFName = gf; badAccnosFName = bf; m = mout; start = st; end = en; + summaryfile = sum; + contigsreport = cont; + badSeqNames = bs; count = 0; } }; @@ -161,7 +238,7 @@ static DWORD WINAPI MySumThreadFunction(LPVOID lpParam){ if (current.getName() != "") { int num = 1; - if (pDataArray->namefile != "") { + if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){ //make sure this sequence is in the namefile, else error map::iterator it = pDataArray->nameMap.find(current.getName()); @@ -170,12 +247,14 @@ static DWORD WINAPI MySumThreadFunction(LPVOID lpParam){ } //for each sequence this sequence represents + int numns = current.getNumNs(); for (int i = 0; i < num; i++) { pDataArray->startPosition.push_back(current.getStartPos()); pDataArray->endPosition.push_back(current.getEndPos()); pDataArray->seqLength.push_back(current.getNumBases()); pDataArray->ambigBases.push_back(current.getAmbigBases()); pDataArray->longHomoPolymer.push_back(current.getLongHomoPolymer()); + pDataArray->numNs.push_back(numns); } } } @@ -192,7 +271,124 @@ static DWORD WINAPI MySumThreadFunction(LPVOID lpParam){ } /**************************************************************************************************/ +static DWORD WINAPI MyContigsSumThreadFunction(LPVOID lpParam){ + contigsSumData* pDataArray; + pDataArray = (contigsSumData*)lpParam; + + try { + string name; + //Name Length Overlap_Length Overlap_Start Overlap_End MisMatches Num_Ns + int length, OLength, thisOStart, thisOEnd, numMisMatches, numns; + + ifstream in; + pDataArray->m->openInputFile(pDataArray->filename, in); + + //print header if you are process 0 + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + in.seekg(0); pDataArray->m->getline(in); pDataArray->m->gobble(in); + }else { //this accounts for the difference in line endings. + in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); + } + + + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + pDataArray->count++; + + if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; } + + //seqname start end nbases ambigs polymer numSeqs + in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numns; pDataArray->m->gobble(in); + + int num = 1; + if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){ + //make sure this sequence is in the namefile, else error + map::iterator it = pDataArray->nameMap.find(name); + + if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; } + else { num = it->second; } + } + + //for each sequence this sequence represents + for (int i = 0; i < num; i++) { + pDataArray->ostartPosition.push_back(thisOStart); + pDataArray->oendPosition.push_back(thisOEnd); + pDataArray->oLength.push_back(OLength); + pDataArray->omismatches.push_back(numMisMatches); + pDataArray->numNs.push_back(numns); + } + } + + in.close(); + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MyContigsThreadFunction"); + exit(1); + } +} +/**************************************************************************************************/ +static DWORD WINAPI MyAlignsThreadFunction(LPVOID lpParam){ + alignsData* pDataArray; + pDataArray = (alignsData*)lpParam; + + try { + + string name, TemplateName, SearchMethod, AlignmentMethod; + //QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template + //checking for minScore, maxInsert, minSim + int length, TemplateLength, QueryStart, QueryEnd, TemplateStart, TemplateEnd, PairwiseAlignmentLength, GapsInQuery, GapsInTemplate, LongestInsert; + float SearchScore, SimBtwnQueryTemplate; + + ifstream in; + pDataArray->m->openInputFile(pDataArray->filename, in); + + //print header if you are process 0 + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + in.seekg(0); pDataArray->m->getline(in); pDataArray->m->gobble(in); + }else { //this accounts for the difference in line endings. + in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); + } + + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + pDataArray->count++; + + if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; } + + in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; pDataArray->m->gobble(in); + cout << i << '\t' << name << endl; + int num = 1; + if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){ + //make sure this sequence is in the namefile, else error + map::iterator it = pDataArray->nameMap.find(name); + + if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; } + else { num = it->second; } + } + + //for each sequence this sequence represents + for (int i = 0; i < num; i++) { + pDataArray->sims.push_back(SimBtwnQueryTemplate); + pDataArray->scores.push_back(SearchScore); + pDataArray->inserts.push_back(LongestInsert); + } + } + + in.close(); + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MyAlignsThreadFunction"); + exit(1); + } +} +/**************************************************************************************************/ static DWORD WINAPI MySumScreenThreadFunction(LPVOID lpParam){ sumScreenData* pDataArray; pDataArray = (sumScreenData*)lpParam; @@ -214,7 +410,7 @@ static DWORD WINAPI MySumScreenThreadFunction(LPVOID lpParam){ }else { //this accounts for the difference in line endings. in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); } - + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process pDataArray->count++; @@ -225,19 +421,30 @@ static DWORD WINAPI MySumScreenThreadFunction(LPVOID lpParam){ if (currSeq.getName() != "") { bool goodSeq = 1; // innocent until proven guilty - if(goodSeq == 1 && pDataArray->startPos != -1 && pDataArray->startPos < currSeq.getStartPos()) { goodSeq = 0; } - if(goodSeq == 1 && pDataArray->endPos != -1 && pDataArray->endPos > currSeq.getEndPos()) { goodSeq = 0; } - if(goodSeq == 1 && pDataArray->maxAmbig != -1 && pDataArray->maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; } - if(goodSeq == 1 && pDataArray->maxHomoP != -1 && pDataArray->maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; } - if(goodSeq == 1 && pDataArray->minLength != -1 && pDataArray->minLength > currSeq.getNumBases()) { goodSeq = 0; } - if(goodSeq == 1 && pDataArray->maxLength != -1 && pDataArray->maxLength < currSeq.getNumBases()) { goodSeq = 0; } + string trashCode = ""; + //have the report files found you bad + map::iterator it = pDataArray->badSeqNames.find(currSeq.getName()); + if (it != pDataArray->badSeqNames.end()) { goodSeq = 0; trashCode = it->second; } //found it + + if (pDataArray->summaryfile == "") { + if(pDataArray->startPos != -1 && pDataArray->startPos < currSeq.getStartPos()) { goodSeq = 0; trashCode += "start|"; } + if(pDataArray->endPos != -1 && pDataArray->endPos > currSeq.getEndPos()) { goodSeq = 0; trashCode += "end|"; } + if(pDataArray->maxAmbig != -1 && pDataArray->maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; trashCode += "ambig|"; } + if(pDataArray->maxHomoP != -1 && pDataArray->maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; trashCode += "homop|"; } + if(pDataArray->minLength != -1 && pDataArray->minLength > currSeq.getNumBases()) { goodSeq = 0; trashCode += "maxLength != -1 && pDataArray->maxLength < currSeq.getNumBases()) { goodSeq = 0; trashCode += ">length|"; } + } + if (pDataArray->contigsreport == "") { //contigs report includes this so no need to check again + if(pDataArray->maxN != -1 && pDataArray->maxN < currSeq.getNumNs()) { goodSeq = 0; trashCode += "n|"; } + } + if(goodSeq == 1){ currSeq.printSequence(goodFile); } else{ - badAccnosFile << currSeq.getName() << endl; - pDataArray->badSeqNames.insert(currSeq.getName()); + badAccnosFile << currSeq.getName() << '\t' << trashCode.substr(0, trashCode.length()-1) << endl; + pDataArray->badSeqNames[currSeq.getName()] = trashCode; } } diff --git a/sequence.cpp b/sequence.cpp index 93ecb70..224ecb1 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -575,6 +575,15 @@ string Sequence::getUnaligned(){ int Sequence::getNumBases(){ return numBases; } +//******************************************************************************************************************** + +int Sequence::getNumNs(){ + int numNs = 0; + for (int i = 0; i < unaligned.length(); i++) { + if(toupper(unaligned[i]) == 'N') { numNs++; } + } + return numNs; +} //******************************************************************************************************************** diff --git a/sequence.hpp b/sequence.hpp index 312e49d..ad3a4b4 100644 --- a/sequence.hpp +++ b/sequence.hpp @@ -48,6 +48,7 @@ public: string getPairwise(); string getUnaligned(); string getInlineSeq(); + int getNumNs(); int getNumBases(); int getStartPos(); int getEndPos(); -- 2.39.2