X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=makecontigscommand.cpp;h=0d563fe846d2179ea7ca9565e05122847747d468;hp=32e2d68a3d1ebe51e0acc8fa64b7a499324fd60f;hb=fefd5ee1517abd3bc38b469cb2dffc85a1571c7e;hpb=0bd3a2d33b478f0b09fd6b8ce562e9ab41227535 diff --git a/makecontigscommand.cpp b/makecontigscommand.cpp index 32e2d68..0d563fe 100644 --- a/makecontigscommand.cpp +++ b/makecontigscommand.cpp @@ -15,24 +15,27 @@ vector MakeContigsCommand::setParameters(){ CommandParameter prfastq("rfastq", "InputTypes", "", "", "none", "none", "fastqGroup","fasta-qfile",false,false,true); parameters.push_back(prfastq); CommandParameter pfasta("ffasta", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "fastaGroup","fasta",false,false,true); parameters.push_back(pfasta); CommandParameter prfasta("rfasta", "InputTypes", "", "", "none", "none", "none","fastaGroup",false,false,true); parameters.push_back(prfasta); - CommandParameter pfqual("fqfile", "InputTypes", "", "", "none", "none", "qfileGroup","qfile",false,false,true); parameters.push_back(pfqual); - CommandParameter prqual("rqfile", "InputTypes", "", "", "none", "none", "qfileGroup","qfile",false,false,true); parameters.push_back(prqual); + CommandParameter pfqual("fqfile", "InputTypes", "", "", "none", "none", "qfileGroup","",false,false,true); parameters.push_back(pfqual); + CommandParameter prqual("rqfile", "InputTypes", "", "", "none", "none", "qfileGroup","",false,false,true); parameters.push_back(prqual); CommandParameter pfile("file", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "none","fasta-qfile",false,false,true); parameters.push_back(pfile); CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos); + CommandParameter pfindex("findex", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(pfindex); + CommandParameter prindex("rindex", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(prindex); CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs); CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pbdiffs); -// CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs); -// CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs); CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs); - + CommandParameter preorient("checkorient", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(preorient); CommandParameter palign("align", "Multiple", "needleman-gotoh", "needleman", "", "", "","",false,false); parameters.push_back(palign); CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pallfiles); + CommandParameter ptrimoverlap("trimoverlap", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptrimoverlap); CommandParameter pmatch("match", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pmatch); CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pmismatch); CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapopen); CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pgapextend); - CommandParameter pthreshold("threshold", "Number", "", "40", "", "", "","",false,false); parameters.push_back(pthreshold); + CommandParameter pthreshold("insert", "Number", "", "20", "", "", "","",false,false); parameters.push_back(pthreshold); + CommandParameter pdeltaq("deltaq", "Number", "", "6", "", "", "","",false,false); parameters.push_back(pdeltaq); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa-illumina1.8+", "illumina1.8+", "", "", "","",false,false,true); parameters.push_back(pformat); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); @@ -49,27 +52,34 @@ vector MakeContigsCommand::setParameters(){ string MakeContigsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The make.contigs command reads a file, forward fastq file and a reverse fastq file or forward fasta and reverse fasta files and outputs new fasta. It will also provide new quality files if the fastq or file parameter is used.\n"; + helpString += "The make.contigs command reads a file, forward fastq file and a reverse fastq file or forward fasta and reverse fasta files and outputs new fasta. \n"; helpString += "If an oligos file is provided barcodes and primers will be trimmed, and a group file will be created.\n"; - helpString += "The make.contigs command parameters are ffastq, rfastq, oligos, tdiffs, bdiffs, ldiffs, sdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, allfiles and processors.\n"; + helpString += "If a forward index or reverse index file is provided barcodes be trimmed, and a group file will be created. The oligos parameter is required if an index file is given.\n"; + helpString += "The make.contigs command parameters are file, ffastq, rfastq, ffasta, rfasta, fqfile, rqfile, oligos, findex, rindex, format, tdiffs, bdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, insert, deltaq, allfiles and processors.\n"; helpString += "The ffastq and rfastq, file, or ffasta and rfasta parameters are required.\n"; - helpString += "The file parameter is 2 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column. Mothur will process each pair and create a combined fasta and qual file with all the sequences.\n"; + helpString += "The file parameter is 2, 3 or 4 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file, or forward fastq file then reverse fastq then forward index and reverse index file. If you only have one index file add 'none' for the other one. Mothur will process each pair and create a combined fasta and report file with all the sequences.\n"; helpString += "The ffastq and rfastq parameters are used to provide a forward fastq and reverse fastq file to process. If you provide one, you must provide the other.\n"; helpString += "The ffasta and rfasta parameters are used to provide a forward fasta and reverse fasta file to process. If you provide one, you must provide the other.\n"; helpString += "The fqfile and rqfile parameters are used to provide a forward quality and reverse quality files to process with the ffasta and rfasta parameters. If you provide one, you must provide the other.\n"; - helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh and needleman. The default is needleman.\n"; + helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=illumina1.8+.\n"; + helpString += "The findex and rindex parameters are used to provide a forward index and reverse index files to process. \n"; + helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh and needleman. The default is needleman.\n"; helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; - helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n"; - helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; + //helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n"; + //helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n"; helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"; + helpString += "The checkorient parameter will check look for the reverse compliment of the barcode or primer in the sequence. If found the sequence is flipped. The default is false.\n"; + helpString += "The deltaq parameter allows you to specify the delta allowed between quality scores of a mismatched base. For example in the overlap, if deltaq=5 and in the alignment seqA, pos 200 has a quality score of 30 and the same position in seqB has a quality score of 20, you take the base from seqA (30-20 >= 5). If the quality score in seqB is 28 then the base in the consensus will be an N (30-28<5) The default is 6.\n"; helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n"; helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n"; - helpString += "The threshold parameter allows you to set a quality scores threshold. In the case where we are trying to decide whether to keep a base or remove it because the base is compared to a gap in the other fragment, if the base has a quality score below the threshold we eliminate it. Default=40.\n"; + helpString += "The insert parameter allows you to set a quality scores threshold. In the case where we are trying to decide whether to keep a base or remove it because the base is compared to a gap in the other fragment, if the base has a quality score equal to or below the threshold we eliminate it. Default=20.\n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n"; + + helpString += "The trimoverlap parameter allows you to trim the sequences to only the overlapping section. The default is F.\n"; helpString += "The make.contigs command should be in the following format: \n"; helpString += "make.contigs(ffastq=yourForwardFastqFile, rfastq=yourReverseFastqFile, align=yourAlignmentMethod) \n"; helpString += "Note: No spaces between parameter labels (i.e. ffastq), '=' and parameters (i.e.yourForwardFastqFile).\n"; @@ -86,9 +96,8 @@ string MakeContigsCommand::getOutputPattern(string type) { string pattern = ""; if (type == "fasta") { pattern = "[filename],[tag],contigs.fasta"; } - else if (type == "qfile") { pattern = "[filename],[tag],contigs.qual"; } else if (type == "group") { pattern = "[filename],[tag],contigs.groups"; } - else if (type == "mismatch") { pattern = "[filename],[tag],contigs.mismatch"; } + else if (type == "report") { pattern = "[filename],[tag],contigs.report"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } return pattern; @@ -105,10 +114,9 @@ MakeContigsCommand::MakeContigsCommand(){ setParameters(); vector tempOutNames; outputTypes["fasta"] = tempOutNames; - outputTypes["qfile"] = tempOutNames; outputTypes["group"] = tempOutNames; - outputTypes["mismatch"] = tempOutNames; - } + outputTypes["report"] = tempOutNames; + } catch(exception& e) { m->errorOut(e, "MakeContigsCommand", "MakeContigsCommand"); exit(1); @@ -117,7 +125,8 @@ MakeContigsCommand::MakeContigsCommand(){ //********************************************************************************************************************** MakeContigsCommand::MakeContigsCommand(string option) { try { - abort = false; calledHelp = false; + abort = false; calledHelp = false; + createFileGroup = false; createOligosGroup = false; //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -140,13 +149,12 @@ MakeContigsCommand::MakeContigsCommand(string option) { //initialize outputTypes vector tempOutNames; outputTypes["fasta"] = tempOutNames; - outputTypes["qfile"] = tempOutNames; - outputTypes["mismatch"] = tempOutNames; + outputTypes["report"] = tempOutNames; outputTypes["group"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter - string inputDir = validParameter.validFile(parameters, "inputdir", false); + inputDir = validParameter.validFile(parameters, "inputdir", false); if (inputDir == "not found"){ inputDir = ""; } else { string path; @@ -213,6 +221,22 @@ MakeContigsCommand::MakeContigsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["oligos"] = inputDir + it->second; } } + + it = parameters.find("findex"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["findex"] = inputDir + it->second; } + } + + it = parameters.find("rindex"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["rindex"] = inputDir + it->second; } + } } ffastqfile = validParameter.validFile(parameters, "ffastq", true); @@ -261,6 +285,29 @@ MakeContigsCommand::MakeContigsCommand(string option) { else if(oligosfile == "not open") { abort = true; } else { m->setOligosFile(oligosfile); } + findexfile = validParameter.validFile(parameters, "findex", true); + if (findexfile == "not found") { findexfile = ""; } + else if(findexfile == "not open") { abort = true; } + + rindexfile = validParameter.validFile(parameters, "rindex", true); + if (rindexfile == "not found") { rindexfile = ""; } + else if(rindexfile == "not open") { abort = true; } + + if ((rindexfile != "") || (findexfile != "")) { + if (oligosfile == ""){ + oligosfile = m->getOligosFile(); + if (oligosfile != "") { m->mothurOut("Using " + oligosfile + " as input file for the oligos parameter.\n"); } + else { + m->mothurOut("You need to provide an oligos file if you are going to use an index file.\n"); abort = true; + } + } + + //can only use an index file with the fastq parameters not fasta and qual + if ((ffastafile != "") || (rfastafile != "")) { + m->mothurOut("[ERROR]: You can only use an index file with the fastq parameters or the file option.\n"); abort = true; + } + } + //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; @@ -285,10 +332,13 @@ MakeContigsCommand::MakeContigsCommand(string option) { m->mothurConvert(temp, gapExtend); if (gapExtend > 0) { m->mothurOut("[ERROR]: gapextend must be negative.\n"); abort=true; } - temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found"){ temp = "40"; } - m->mothurConvert(temp, threshold); - if ((threshold < 0) || (threshold > 40)) { m->mothurOut("[ERROR]: threshold must be between 0 and 40.\n"); abort=true; } + temp = validParameter.validFile(parameters, "insert", false); if (temp == "not found"){ temp = "20"; } + m->mothurConvert(temp, insert); + if ((insert < 0) || (insert > 40)) { m->mothurOut("[ERROR]: insert must be between 0 and 40.\n"); abort=true; } + temp = validParameter.validFile(parameters, "deltaq", false); if (temp == "not found"){ temp = "6"; } + m->mothurConvert(temp, deltaq); + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); @@ -314,9 +364,29 @@ MakeContigsCommand::MakeContigsCommand(string option) { temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; } allFiles = m->isTrue(temp); + + + temp = validParameter.validFile(parameters, "trimoverlap", false); if (temp == "not found") { temp = "F"; } + trimOverlap = m->isTrue(temp); align = validParameter.validFile(parameters, "align", false); if (align == "not found"){ align = "needleman"; } if ((align != "needleman") && (align != "gotoh")) { m->mothurOut(align + " is not a valid alignment method. Options are needleman or gotoh. I will use needleman."); m->mothurOutEndLine(); align = "needleman"; } + + format = validParameter.validFile(parameters, "format", false); if (format == "not found"){ format = "illumina1.8+"; } + + if ((format != "sanger") && (format != "illumina") && (format != "illumina1.8+") && (format != "solexa")) { + m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa, illumina1.8+ and illumina, aborting." ); m->mothurOutEndLine(); + abort=true; + } + + temp = validParameter.validFile(parameters, "checkorient", false); if (temp == "not found") { temp = "F"; } + reorient = m->isTrue(temp); + + //fill convert table - goes from solexa to sanger. Used fq_all2std.pl as a reference. + for (int i = -64; i < 65; i++) { + char temp = (char) ((int)(33 + 10*log(1+pow(10,(i/10.0)))/log(10)+0.499)); + convertTable.push_back(temp); + } } } @@ -350,75 +420,65 @@ int MakeContigsCommand::execute(){ string compositeGroupFile = getOutputFileName("group",cvars); cvars["[tag]"] = "trim"; string compositeFastaFile = getOutputFileName("fasta",cvars); - string compositeQualFile = getOutputFileName("qfile",cvars); cvars["[tag]"] = "scrap"; string compositeScrapFastaFile = getOutputFileName("fasta",cvars); - string compositeScrapQualFile = getOutputFileName("qfile",cvars); cvars["[tag]"] = ""; - string compositeMisMatchFile = getOutputFileName("mismatch",cvars); + string compositeMisMatchFile = getOutputFileName("report",cvars); if (filesToProcess.size() > 1) { //clear files for append below ofstream outCTFasta, outCTQual, outCSFasta, outCSQual, outCMisMatch; m->openOutputFile(compositeFastaFile, outCTFasta); outCTFasta.close(); m->openOutputFile(compositeScrapFastaFile, outCSFasta); outCSFasta.close(); m->openOutputFile(compositeMisMatchFile, outCMisMatch); outCMisMatch.close(); - m->openOutputFile(compositeQualFile, outCTQual); outCTQual.close(); - m->openOutputFile(compositeScrapQualFile, outCSQual); outCSQual.close(); outputNames.push_back(compositeFastaFile); outputTypes["fasta"].push_back(compositeFastaFile); - outputNames.push_back(compositeQualFile); outputTypes["qfile"].push_back(compositeQualFile); - outputNames.push_back(compositeMisMatchFile); outputTypes["mismatch"].push_back(compositeMisMatchFile); + outputNames.push_back(compositeMisMatchFile); outputTypes["report"].push_back(compositeMisMatchFile); outputNames.push_back(compositeScrapFastaFile); outputTypes["fasta"].push_back(compositeScrapFastaFile); - outputNames.push_back(compositeScrapQualFile); outputTypes["qfile"].push_back(compositeScrapQualFile); } + map totalGroupCounts; + for (int l = 0; l < filesToProcess.size(); l++) { m->mothurOut("\n>>>>>\tProcessing " + filesToProcess[l][0][0] + " (file " + toString(l+1) + " of " + toString(filesToProcess.size()) + ")\t<<<<<\n"); + groupCounts.clear(); + groupMap.clear(); vector > fastaFileNames; - vector > qualFileNames; - createGroup = false; + map uniqueFastaNames;// so we don't add the same groupfile multiple times + createOligosGroup = false; + oligos = new Oligos(); + numBarcodes = 0; numFPrimers= 0; numLinkers= 0; numSpacers = 0; numRPrimers = 0; string outputGroupFileName; map variables; string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir = m->hasPath(filesToProcess[l][0][0]); } variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(filesToProcess[l][0][0])); variables["[tag]"] = ""; - if(oligosfile != ""){ - createGroup = getOligos(fastaFileNames, qualFileNames, variables["[filename]"]); - if (createGroup) { - outputGroupFileName = getOutputFileName("group",variables); - outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); - } + if(oligosfile != ""){ createOligosGroup = getOligos(fastaFileNames, variables["[filename]"], uniqueFastaNames); } + if (createOligosGroup || createFileGroup) { + outputGroupFileName = getOutputFileName("group",variables); } + //give group in file file precedence + if (createFileGroup) { createOligosGroup = false; } + variables["[tag]"] = "trim"; string outFastaFile = getOutputFileName("fasta",variables); - string outQualFile = getOutputFileName("qfile",variables); variables["[tag]"] = "scrap"; string outScrapFastaFile = getOutputFileName("fasta",variables); - string outScrapQualFile = getOutputFileName("qfile",variables); variables["[tag]"] = ""; - string outMisMatchFile = getOutputFileName("mismatch",variables); - outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); - outputNames.push_back(outScrapFastaFile); outputTypes["fasta"].push_back(outScrapFastaFile); - if (filesToProcess[l][0][1] != "") { - outputNames.push_back(outQualFile); outputTypes["qfile"].push_back(outQualFile); - outputNames.push_back(outScrapQualFile); outputTypes["qfile"].push_back(outScrapQualFile); - } - outputNames.push_back(outMisMatchFile); outputTypes["mismatch"].push_back(outMisMatchFile); - + string outMisMatchFile = getOutputFileName("report",variables); + m->mothurOut("Making contigs...\n"); - createProcesses(filesToProcess[l], outFastaFile, outQualFile, outScrapFastaFile, outScrapQualFile, outMisMatchFile, fastaFileNames, qualFileNames); - m->mothurOut("Done.\n"); + createProcesses(filesToProcess[l], outFastaFile, outScrapFastaFile, outMisMatchFile, fastaFileNames, l); //remove temp fasta and qual files for (int i = 0; i < processors; i++) { for(int j = 0; j < filesToProcess[l][i].size(); j++) { m->mothurRemove(filesToProcess[l][i][j]); } } - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete oligos; return 0; } if(allFiles){ - map uniqueFastaNames;// so we don't add the same groupfile multiple times + // so we don't add the same groupfile multiple times map::iterator it; set namesToRemove; for(int i=0;iisBlank(fastaFileNames[i][j])){ m->mothurRemove(fastaFileNames[i][j]); namesToRemove.insert(fastaFileNames[i][j]); - - if (filesToProcess[l][0][1] != "") { - m->mothurRemove(qualFileNames[i][j]); - namesToRemove.insert(qualFileNames[i][j]); - } - }else{ - it = uniqueFastaNames.find(fastaFileNames[i][j]); - if (it == uniqueFastaNames.end()) { - uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i]; - } + uniqueFastaNames.erase(fastaFileNames[i][j]); //remove from list for group file print } } } @@ -454,22 +505,22 @@ int MakeContigsCommand::execute(){ m->openInputFile(it->first, in); ofstream out; - string thisGroupName = thisOutputDir + m->getRootName(m->getSimpleName(it->first)); - thisGroupName += getOutputFileName("group",variables); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(it->first)); + string thisGroupName = getOutputFileName("group",variables); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); m->openOutputFile(thisGroupName, out); while (!in.eof()){ if (m->control_pressed) { break; } Sequence currSeq(in); m->gobble(in); - out << currSeq.getName() << '\t' << it->second << endl; + out << currSeq.getName() << '\t' << it->second << endl; } - in.close(); out.close(); + in.close(); } } - if (createGroup) { + if (createFileGroup || createOligosGroup) { ofstream outGroup; m->openOutputFile(outputGroupFileName, outGroup); for (map::iterator itGroup = groupMap.begin(); itGroup != groupMap.end(); itGroup++) { @@ -479,21 +530,48 @@ int MakeContigsCommand::execute(){ } if (filesToProcess.size() > 1) { //merge into large combo files - if (createGroup) { - if (l == 0) { + if (createFileGroup || createOligosGroup) { + if (l == 0) { ofstream outCGroup; m->openOutputFile(compositeGroupFile, outCGroup); outCGroup.close(); outputNames.push_back(compositeGroupFile); outputTypes["group"].push_back(compositeGroupFile); } - m->appendFiles(outputGroupFileName, compositeGroupFile); + m->appendFiles(outputGroupFileName, compositeGroupFile); + if (!allFiles) { m->mothurRemove(outputGroupFileName); } + else { outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); } + + for (map::iterator itGroups = groupCounts.begin(); itGroups != groupCounts.end(); itGroups++) { + map::iterator itTemp = totalGroupCounts.find(itGroups->first); + if (itTemp == totalGroupCounts.end()) { totalGroupCounts[itGroups->first] = itGroups->second; } //new group create it in totalGroups + else { itTemp->second += itGroups->second; } //existing group, update total + } } - m->appendFiles(outMisMatchFile, compositeMisMatchFile); + if (l == 0) { m->appendFiles(outMisMatchFile, compositeMisMatchFile); } + else { m->appendFilesWithoutHeaders(outMisMatchFile, compositeMisMatchFile); } m->appendFiles(outFastaFile, compositeFastaFile); - m->appendFiles(outQualFile, compositeQualFile); m->appendFiles(outScrapFastaFile, compositeScrapFastaFile); - m->appendFiles(outScrapQualFile, compositeScrapQualFile); + if (!allFiles) { + m->mothurRemove(outMisMatchFile); + m->mothurRemove(outFastaFile); + m->mothurRemove(outScrapFastaFile); + }else { + outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); + outputNames.push_back(outScrapFastaFile); outputTypes["fasta"].push_back(outScrapFastaFile); + outputNames.push_back(outMisMatchFile); outputTypes["report"].push_back(outMisMatchFile); + } + }else { + totalGroupCounts = groupCounts; + outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); + outputNames.push_back(outScrapFastaFile); outputTypes["fasta"].push_back(outScrapFastaFile); + outputNames.push_back(outMisMatchFile); outputTypes["report"].push_back(outMisMatchFile); + if (createFileGroup || createOligosGroup) { + outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); + } } + m->mothurOut("Done.\n"); + delete oligos; } + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process " + toString(numReads) + " sequences.\n"); if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -501,8 +579,8 @@ int MakeContigsCommand::execute(){ //output group counts m->mothurOutEndLine(); int total = 0; - if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); } - for (map::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) { + if (totalGroupCounts.size() != 0) { m->mothurOut("Group count: \n"); } + for (map::iterator it = totalGroupCounts.begin(); it != totalGroupCounts.end(); it++) { total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine(); } if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); } @@ -515,12 +593,6 @@ int MakeContigsCommand::execute(){ if ((itTypes->second).size() != 0) { currentFasta = (itTypes->second)[0]; m->setFastaFile(currentFasta); } } - string currentQual = ""; - itTypes = outputTypes.find("qfile"); - if (itTypes != outputTypes.end()) { - if ((itTypes->second).size() != 0) { currentQual = (itTypes->second)[0]; m->setQualFile(currentQual); } - } - string currentGroup = ""; itTypes = outputTypes.find("group"); if (itTypes != outputTypes.end()) { @@ -546,7 +618,7 @@ vector< vector< vector > > MakeContigsCommand::preProcessData(unsigned l vector< vector< vector > > filesToProcess; if (ffastqfile != "") { //reading one file - vector< vector > files = readFastqFiles(numReads, ffastqfile, rfastqfile); + vector< vector > files = readFastqFiles(numReads, ffastqfile, rfastqfile, findexfile, rindexfile); //adjust for really large processors or really small files if (numReads == 0) { m->mothurOut("[ERROR]: no good reads.\n"); m->control_pressed = true; } if (numReads < processors) { @@ -567,12 +639,20 @@ vector< vector< vector > > MakeContigsCommand::preProcessData(unsigned l if (m->control_pressed) { for (int l = 0; l < filesToProcess.size(); l++) { for (int k = 0; k < filesToProcess[l].size(); k++) { for(int j = 0; j < filesToProcess[l][k].size(); j++) { m->mothurRemove(filesToProcess[l][k][j]); } filesToProcess[l][k].clear(); } return filesToProcess; } } unsigned long int thisFilesReads; - vector< vector > files = readFastqFiles(thisFilesReads, filePairsToProcess[i][0], filePairsToProcess[i][1]); + vector< vector > files = readFastqFiles(thisFilesReads, filePairsToProcess[i][0], filePairsToProcess[i][1], filePairsToProcess[i][2], filePairsToProcess[i][3]); //adjust for really large processors or really small files if (thisFilesReads < processors) { m->mothurOut("[ERROR]: " + filePairsToProcess[i][0] + " has less than " + toString(processors) + " good reads, skipping\n"); for (int k = 0; k < files.size(); k++) { for(int j = 0; j < files[k].size(); j++) { m->mothurRemove(files[k][j]); } files[k].clear(); } + //remove from file2Group if necassary + map cFile2Group; + for (map::iterator it = file2Group.begin(); it != file2Group.end(); it++) { + if ((it->first) < i) { cFile2Group[it->first] = it->second; } + else if ((it->first) == i) { } //do nothing, we removed files for i + else { cFile2Group[(it->first-1)] = it->second; } //adjust files because i was removed + } + file2Group = cFile2Group; }else { filesToProcess.push_back(files); numReads += thisFilesReads; @@ -601,23 +681,26 @@ vector< vector< vector > > MakeContigsCommand::preProcessData(unsigned l } } //********************************************************************************************************************** -int MakeContigsCommand::createProcesses(vector< vector > files, string outputFasta, string outputQual, string outputScrapFasta, string outputScrapQual, string outputMisMatches, vector > fastaFileNames, vector > qualFileNames) { +int MakeContigsCommand::createProcesses(vector< vector > files, string outputFasta, string outputScrapFasta, string outputMisMatches, vector > fastaFileNames, int index) { try { int num = 0; vector processIDS; + string group = ""; + map::iterator it = file2Group.find(index); + if (it != file2Group.end()) { group = it->second; } + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) int process = 0; //loop through and create all the processes you want while (process != processors-1) { - int pid = fork(); + pid_t pid = fork(); if (pid > 0) { processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ vector > tempFASTAFileNames = fastaFileNames; - vector > tempPrimerQualFileNames = qualFileNames; if(allFiles){ ofstream temp; @@ -625,33 +708,25 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o for(int i=0;imothurGetpid(process) + ".temp"; m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); - - if (files[processors-1][1] != "") { - tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp"; - m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); - } } } } } - + num = driver(files[process], - outputFasta + toString(getpid()) + ".temp", - outputQual + toString(getpid()) + ".temp", - outputScrapFasta + toString(getpid()) + ".temp", - outputScrapQual + toString(getpid()) + ".temp", - outputMisMatches + toString(getpid()) + ".temp", - tempFASTAFileNames, - tempPrimerQualFileNames); + outputFasta + m->mothurGetpid(process) + ".temp", + outputScrapFasta + m->mothurGetpid(process) + ".temp", + outputMisMatches + m->mothurGetpid(process) + ".temp", + tempFASTAFileNames, process, group); //pass groupCounts to parent ofstream out; - string tempFile = toString(getpid()) + ".num.temp"; + string tempFile = m->mothurGetpid(process) + ".num.temp"; m->openOutputFile(tempFile, out); out << num << endl; - if(createGroup){ + if (createFileGroup || createOligosGroup) { out << groupCounts.size() << endl; for (map::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) { @@ -676,13 +751,9 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o ofstream temp; m->openOutputFile(outputFasta, temp); temp.close(); m->openOutputFile(outputScrapFasta, temp); temp.close(); - if (files[processors-1][1] != "") { - m->openOutputFile(outputScrapQual, temp); temp.close(); - m->openOutputFile(outputQual, temp); temp.close(); - } - + //do my part - num = driver(files[processors-1], outputFasta, outputQual, outputScrapFasta, outputScrapQual, outputMisMatches, fastaFileNames, qualFileNames); + num = driver(files[processors-1], outputFasta, outputScrapFasta, outputMisMatches, fastaFileNames, processors-1, group); //force parent to wait until all the processes are done for (int i=0;i > files, string o int tempNum; in >> tempNum; num += tempNum; m->gobble(in); - if(createGroup){ + if (createFileGroup || createOligosGroup) { string group; in >> tempNum; m->gobble(in); @@ -741,8 +812,7 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o string extension = ""; if (h != 0) { extension = toString(h) + ".temp"; processIDS.push_back(h); } vector > tempFASTAFileNames = fastaFileNames; - vector > tempPrimerQualFileNames = qualFileNames; - + if(allFiles){ ofstream temp; @@ -751,25 +821,18 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o if (tempFASTAFileNames[i][j] != "") { tempFASTAFileNames[i][j] += extension; m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); - - if (files[processors-1][1] != "") { - tempPrimerQualFileNames[i][j] += extension; - m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); - } } } } } - - - contigsData* tempcontig = new contigsData(files[h], (outputFasta + extension), (outputQual + extension), (outputScrapFasta + extension), (outputScrapQual + extension),(outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, threshold, barcodes, primers, tempFASTAFileNames, tempPrimerQualFileNames, barcodeNameVector, primerNameVector, pdiffs, bdiffs, tdiffs, createGroup, allFiles, h); + + contigsData* tempcontig = new contigsData(group, files[h], (outputFasta + extension), (outputScrapFasta + extension), (outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, insert, deltaq, tempFASTAFileNames, oligosfile, reorient, pdiffs, bdiffs, tdiffs, createOligosGroup, createFileGroup, allFiles, trimOverlap, h); pDataArray.push_back(tempcontig); hThreadArray[h] = CreateThread(NULL, 0, MyContigsThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]); } vector > tempFASTAFileNames = fastaFileNames; - vector > tempPrimerQualFileNames = qualFileNames; if(allFiles){ ofstream temp; @@ -780,11 +843,6 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o if (tempFASTAFileNames[i][j] != "") { tempFASTAFileNames[i][j] += extension; m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); - - if (files[processors-1][1] != "") { - tempPrimerQualFileNames[i][j] += extension; - m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); - } } } } @@ -794,14 +852,10 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o ofstream temp; m->openOutputFile(outputFasta, temp); temp.close(); m->openOutputFile(outputScrapFasta, temp); temp.close(); - if (files[processors-1][1] != "") { - m->openOutputFile(outputScrapQual, temp); temp.close(); - m->openOutputFile(outputQual, temp); temp.close(); - } //do my part processIDS.push_back(processors-1); - num = driver(files[processors-1], (outputFasta+ toString(processors-1) + ".temp"), (outputQual+ toString(processors-1) + ".temp"), (outputScrapFasta+ toString(processors-1) + ".temp"), (outputScrapQual+ toString(processors-1) + ".temp"), (outputMisMatches+ toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames); + num = driver(files[processors-1], (outputFasta+ toString(processors-1) + ".temp"), (outputScrapFasta+ toString(processors-1) + ".temp"), (outputMisMatches+ toString(processors-1) + ".temp"), tempFASTAFileNames, processors-1, group); //Wait until all threads have terminated. WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); @@ -809,6 +863,9 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o //Close all thread handles and free memory allocations. for(int i=0; i < pDataArray.size(); i++){ num += pDataArray[i]->count; + if (!pDataArray[i]->done) { + m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of sequences assigned to it, quitting. \n"); m->control_pressed = true; + } for (map::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) { map::iterator it2 = groupCounts.find(it->first); if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; } @@ -831,16 +888,8 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o m->appendFiles((outputScrapFasta + toString(processIDS[i]) + ".temp"), outputScrapFasta); m->mothurRemove((outputScrapFasta + toString(processIDS[i]) + ".temp")); - - if (files[processors-1][1] != "") { - m->appendFiles((outputScrapQual + toString(processIDS[i]) + ".temp"), outputScrapQual); - m->mothurRemove((outputScrapQual + toString(processIDS[i]) + ".temp")); - - m->appendFiles((outputQual + toString(processIDS[i]) + ".temp"), outputQual); - m->mothurRemove((outputQual + toString(processIDS[i]) + ".temp")); - } - m->appendFiles((outputMisMatches + toString(processIDS[i]) + ".temp"), outputMisMatches); + m->appendFilesWithoutHeaders((outputMisMatches + toString(processIDS[i]) + ".temp"), outputMisMatches); m->mothurRemove((outputMisMatches + toString(processIDS[i]) + ".temp")); if(allFiles){ @@ -849,11 +898,6 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o if (fastaFileNames[j][k] != "") { m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]); m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp")); - - if (files[processors-1][1] != "") { - m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]); - m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp")); - } } } } @@ -868,7 +912,7 @@ int MakeContigsCommand::createProcesses(vector< vector > files, string o } } //********************************************************************************************************************** -int MakeContigsCommand::driver(vector files, string outputFasta, string outputQual, string outputScrapFasta, string outputScrapQual, string outputMisMatches, vector > fastaFileNames, vector > qualFileNames){ +int MakeContigsCommand::driver(vector files, string outputFasta, string outputScrapFasta, string outputMisMatches, vector > fastaFileNames, int process, string group){ try { Alignment* alignment; @@ -880,25 +924,34 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string string thisfqualfile = files[1]; string thisrfastafile = files[2]; string thisrqualfile = files[3]; + string thisfindexfile = files[4]; + string thisrindexfile = files[5]; - if (m->debug) { m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); } + if (m->debug) { m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n[DEBUG]: findex = " + thisfindexfile + ".\n[DEBUG]: rindex = " + thisrindexfile + ".\n"); } - ifstream inFFasta, inRFasta, inFQual, inRQual; - ofstream outFasta, outQual, outMisMatch, outScrapFasta, outScrapQual; + ifstream inFFasta, inRFasta, inFQual, inRQual, inFIndex, inRIndex; + ofstream outFasta, outMisMatch, outScrapFasta; m->openInputFile(thisffastafile, inFFasta); m->openInputFile(thisrfastafile, inRFasta); if (thisfqualfile != "") { m->openInputFile(thisfqualfile, inFQual); m->openInputFile(thisrqualfile, inRQual); - m->openOutputFile(outputScrapQual, outScrapQual); - m->openOutputFile(outputQual, outQual); } + + if (thisfindexfile != "") { m->openInputFile(thisfindexfile, inFIndex); } + if (thisrindexfile != "") { m->openInputFile(thisrindexfile, inRIndex); } + m->openOutputFile(outputFasta, outFasta); m->openOutputFile(outputScrapFasta, outScrapFasta); m->openOutputFile(outputMisMatches, outMisMatch); - outMisMatch << "Name\tLength\tMisMatches\n"; + outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n"; - TrimOligos trimOligos(pdiffs, bdiffs, 0, 0, primers, barcodes); + TrimOligos trimOligos(pdiffs, bdiffs, 0, 0, oligos->getPairedPrimers(), oligos->getPairedBarcodes()); + + TrimOligos* rtrimOligos = NULL; + if (reorient) { + rtrimOligos = new TrimOligos(pdiffs, bdiffs, 0, 0, oligos->getReorientedPairedPrimers(), oligos->getReorientedPairedBarcodes()); numBarcodes = oligos->getReorientedPairedBarcodes().size(); + } while ((!inFFasta.eof()) && (!inRFasta.eof())) { @@ -916,13 +969,34 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string fQual = new QualityScores(inFQual); m->gobble(inFQual); rQual = new QualityScores(inRQual); m->gobble(inRQual); } + Sequence findexBarcode("findex", "NONE"); Sequence rindexBarcode("rindex", "NONE"); + if (thisfindexfile != "") { + Sequence temp(inFIndex); m->gobble(inFIndex); + findexBarcode.setAligned(temp.getAligned()); + } + + if (thisrindexfile != "") { + Sequence temp(inRIndex); m->gobble(inRIndex); + rindexBarcode.setAligned(temp.getAligned()); + } int barcodeIndex = 0; int primerIndex = 0; - - if(barcodes.size() != 0){ + Sequence savedFSeq(fSeq.getName(), fSeq.getAligned()); Sequence savedRSeq(rSeq.getName(), rSeq.getAligned()); + Sequence savedFindex(findexBarcode.getName(), findexBarcode.getAligned()); Sequence savedRIndex(rindexBarcode.getName(), rindexBarcode.getAligned()); + QualityScores* savedFQual = NULL; QualityScores* savedRQual = NULL; + if (thisfqualfile != "") { + savedFQual = new QualityScores(fQual->getName(), fQual->getQualityScores()); + savedRQual = new QualityScores(rQual->getName(), rQual->getQualityScores()); + } + + if(numBarcodes != 0){ if (thisfqualfile != "") { - success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex); + if ((thisfindexfile != "") || (thisrindexfile != "")) { + success = trimOligos.stripBarcode(findexBarcode, rindexBarcode, *fQual, *rQual, barcodeIndex); + }else { + success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex); + } }else { success = trimOligos.stripBarcode(fSeq, rSeq, barcodeIndex); } @@ -930,7 +1004,7 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string else{ currentSeqsDiffs += success; } } - if(primers.size() != 0){ + if(numFPrimers != 0){ if (thisfqualfile != "") { success = trimOligos.stripForward(fSeq, rSeq, *fQual, *rQual, primerIndex); }else { @@ -942,6 +1016,58 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } + if (reorient && (trashCode != "")) { //if you failed and want to check the reverse + int thisSuccess = 0; + string thisTrashCode = ""; + int thisCurrentSeqsDiffs = 0; + + int thisBarcodeIndex = 0; + int thisPrimerIndex = 0; + + if(numBarcodes != 0){ + if (thisfqualfile != "") { + if ((thisfindexfile != "") || (thisrindexfile != "")) { + thisSuccess = rtrimOligos->stripBarcode(savedFindex, savedRIndex, *savedFQual, *savedRQual, thisBarcodeIndex); + }else { + thisSuccess = rtrimOligos->stripBarcode(savedFSeq, savedRSeq, *savedFQual, *savedRQual, thisBarcodeIndex); + } + }else { + thisSuccess = rtrimOligos->stripBarcode(savedFSeq, savedRSeq, thisBarcodeIndex); + } + if(thisSuccess > bdiffs) { thisTrashCode += 'b'; } + else{ thisCurrentSeqsDiffs += thisSuccess; } + } + + if(numFPrimers != 0){ + if (thisfqualfile != "") { + thisSuccess = rtrimOligos->stripForward(savedFSeq, savedRSeq, *savedFQual, *savedRQual, thisPrimerIndex); + }else { + thisSuccess = rtrimOligos->stripForward(savedFSeq, savedRSeq, thisPrimerIndex); + } + if(thisSuccess > pdiffs) { thisTrashCode += 'f'; } + else{ thisCurrentSeqsDiffs += thisSuccess; } + } + + if (thisCurrentSeqsDiffs > tdiffs) { thisTrashCode += 't'; } + + if (thisTrashCode == "") { + trashCode = thisTrashCode; + success = thisSuccess; + currentSeqsDiffs = thisCurrentSeqsDiffs; + barcodeIndex = thisBarcodeIndex; + primerIndex = thisPrimerIndex; + savedFSeq.reverseComplement(); + savedRSeq.reverseComplement(); + fSeq.setAligned(savedFSeq.getAligned()); + rSeq.setAligned(savedRSeq.getAligned()); + if(thisfqualfile != ""){ + savedFQual->flipQScores(); savedRQual->flipQScores(); + fQual->setScores(savedFQual->getScores()); rQual->setScores(savedRQual->getScores()); + } + }else { trashCode += "(" + thisTrashCode + ")"; } + } + + //flip the reverse reads rSeq.reverseComplement(); if (thisfqualfile != "") { rQual->flipQScores(); } @@ -956,7 +1082,6 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string //traverse alignments merging into one contiguous seq string contig = ""; - vector contigScores; int numMismatches = 0; string seq1 = fSeq.getAligned(); string seq2 = rSeq.getAligned(); @@ -964,24 +1089,19 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string if (thisfqualfile != "") { scores1 = fQual->getQualityScores(); scores2 = rQual->getQualityScores(); - delete fQual; delete rQual; + delete fQual; delete rQual; delete savedFQual; delete savedRQual; } // if (num < 5) { cout << fSeq.getStartPos() << '\t' << fSeq.getEndPos() << '\t' << rSeq.getStartPos() << '\t' << rSeq.getEndPos() << endl; } int overlapStart = fSeq.getStartPos(); int seq2Start = rSeq.getStartPos(); + //bigger of the 2 starting positions is the location of the overlapping start if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1 overlapStart = seq2Start; - for (int i = 0; i < overlapStart; i++) { - contig += seq1[i]; - if (thisfqualfile != "") { contigScores.push_back(scores1[ABaseMap[i]]); } - } + for (int i = 0; i < overlapStart; i++) { contig += seq1[i]; } }else { //seq1 starts later so take from 0 to overlapStart from seq2 - for (int i = 0; i < overlapStart; i++) { - contig += seq2[i]; - if (thisfqualfile != "") { contigScores.push_back(scores2[BBaseMap[i]]); } - } + for (int i = 0; i < overlapStart; i++) { contig += seq2[i]; } } int seq1End = fSeq.getEndPos(); @@ -989,73 +1109,54 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string int overlapEnd = seq1End; if (seq2End < overlapEnd) { overlapEnd = seq2End; } //smallest end position is where overlapping ends + int oStart = contig.length(); + //cout << fSeq.getAligned() << endl; cout << rSeq.getAligned() << endl; for (int i = overlapStart; i < overlapEnd; i++) { + //cout << seq1[i] << ' ' << seq2[i] << ' ' << scores1[ABaseMap[i]] << ' ' << scores2[BBaseMap[i]] << endl; if (seq1[i] == seq2[i]) { //match, add base and choose highest score contig += seq1[i]; - if (thisfqualfile != "") { - contigScores.push_back(scores1[ABaseMap[i]]); - if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; } - } - }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below threshold. In that case eliminate base + }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below insert. In that case eliminate base if (thisfqualfile != "") { - if (scores2[BBaseMap[i]] < threshold) { } // - else { - contig += seq2[i]; - contigScores.push_back(scores2[BBaseMap[i]]); - } + if (scores2[BBaseMap[i]] <= insert) { } // + else { contig += seq2[i]; } }else { contig += seq2[i]; } //with no quality info, then we keep it? - }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below threshold. In that case eliminate base + }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below insert. In that case eliminate base if (thisfqualfile != "") { - if (scores1[ABaseMap[i]] < threshold) { } // - else { - contig += seq1[i]; - contigScores.push_back(scores1[ABaseMap[i]]); - } + if (scores1[ABaseMap[i]] <= insert) { } // + else { contig += seq1[i]; } }else { contig += seq1[i]; } //with no quality info, then we keep it? }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality if (thisfqualfile != "") { - char c = seq1[i]; - contigScores.push_back(scores1[ABaseMap[i]]); - if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { contigScores[contigScores.size()-1] = scores2[BBaseMap[i]]; c = seq2[i]; } - contig += c; + if (abs(scores1[ABaseMap[i]] - scores2[BBaseMap[i]]) >= deltaq) { //is the difference in qual scores >= deltaq, if yes choose base with higher score + char c = seq1[i]; + if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { c = seq2[i]; } + contig += c; + }else { //if no, base becomes n + contig += 'N'; + } numMismatches++; }else { numMismatches++; } //cant decide, so eliminate and mark as mismatch }else { //should never get here m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n"); } } - + int oend = contig.length(); if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2 - for (int i = overlapEnd; i < length; i++) { - contig += seq2[i]; - if (thisfqualfile != "") { contigScores.push_back(scores2[BBaseMap[i]]); } - } + for (int i = overlapEnd; i < length; i++) { contig += seq2[i]; } }else { //seq2 ends before seq1 so take from overlap to length from seq1 - for (int i = overlapEnd; i < length; i++) { - contig += seq1[i]; - if (thisfqualfile != "") { contigScores.push_back(scores1[ABaseMap[i]]); } - } - + for (int i = overlapEnd; i < length; i++) { contig += seq1[i]; } } + //cout << contig << endl; + //exit(1); + if (trimOverlap) { contig = contig.substr(overlapStart-1, oend-oStart); if (contig.length() == 0) { trashCode += "l"; } } if(trashCode.length() == 0){ bool ignore = false; if (m->debug) { m->mothurOut(fSeq.getName()); } - if (createGroup) { - if(barcodes.size() != 0){ - string thisGroup = barcodeNameVector[barcodeIndex]; - if (primers.size() != 0) { - if (primerNameVector[primerIndex] != "") { - if(thisGroup != "") { - thisGroup += "." + primerNameVector[primerIndex]; - }else { - thisGroup = primerNameVector[primerIndex]; - } - } - } - + if (createOligosGroup) { + string thisGroup = oligos->getGroupName(barcodeIndex, primerIndex); if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); } int pos = thisGroup.find("ignore"); @@ -1066,47 +1167,38 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; } else { groupCounts[it->first] ++; } }else { ignore = true; } + }else if (createFileGroup) { + int pos = group.find("ignore"); + if (pos == string::npos) { + groupMap[fSeq.getName()] = group; - } + map::iterator it = groupCounts.find(group); + if (it == groupCounts.end()) { groupCounts[group] = 1; } + else { groupCounts[it->first] ++; } + }else { ignore = true; } } if (m->debug) { m->mothurOut("\n"); } - + if(allFiles && !ignore){ ofstream output; m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output); output << ">" << fSeq.getName() << endl << contig << endl; output.close(); - - if (thisfqualfile != "") { - m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output); - output << ">" << fSeq.getName() << endl; - for (int i = 0; i < contigScores.size(); i++) { output << contigScores[i] << ' '; } - output << endl; - output.close(); - } } //output outFasta << ">" << fSeq.getName() << endl << contig << endl; - if (thisfqualfile != "") { - outQual << ">" << fSeq.getName() << endl; - for (int i = 0; i < contigScores.size(); i++) { outQual << contigScores[i] << ' '; } - outQual << endl; - } - outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << numMismatches << endl; + int numNs = 0; + for (int i = 0; i < contig.length(); i++) { if (contig[i] == 'N') { numNs++; } } + outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << (oend-oStart) << '\t' << oStart << '\t' << oend << '\t' << numMismatches << '\t' << numNs << endl; }else { //output outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl; - if (thisfqualfile != "") { - outScrapQual << ">" << fSeq.getName() << " | " << trashCode << endl; - for (int i = 0; i < contigScores.size(); i++) { outScrapQual << contigScores[i] << ' '; } - outScrapQual << endl; - } } num++; //report progress - if((num) % 1000 == 0){ m->mothurOut(toString(num)); m->mothurOutEndLine(); } + if((num) % 1000 == 0){ m->mothurOutJustToScreen(toString(num)); m->mothurOutEndLine(); } } //report progress @@ -1120,12 +1212,11 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string if (thisfqualfile != "") { inFQual.close(); inRQual.close(); - outQual.close(); - outScrapQual.close(); } delete alignment; + if (reorient) { delete rtrimOligos; } - if (m->control_pressed) { m->mothurRemove(outputFasta); m->mothurRemove(outputScrapFasta);m->mothurRemove(outputMisMatches); if (thisfqualfile != "") { m->mothurRemove(outputQual); m->mothurRemove(outputScrapQual); } } + if (m->control_pressed) { m->mothurRemove(outputFasta); m->mothurRemove(outputScrapFasta);m->mothurRemove(outputMisMatches); } return num; } @@ -1135,11 +1226,11 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string } } //********************************************************************************************************************** -vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& count, string ffastq, string rfastq){ +vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& count, string ffastq, string rfastq, string findex, string rindex){ try { vector< vector > files; //maps processors number to file pointer - map > tempfiles; //tempfiles[0] = forwardFasta, [1] = forwardQual, [2] = reverseFasta, [3] = reverseQual + map > tempfiles; //tempfiles[0] = forwardFasta, [1] = forwardQual, [2] = reverseFasta, [3] = reverseQual, tempfiles[4] = forwardIndex, [4] = forwardReverse map >::iterator it; //create files to write to @@ -1149,6 +1240,8 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c ofstream* outFQ = new ofstream; temp.push_back(outFQ); ofstream* outRF = new ofstream; temp.push_back(outRF); ofstream* outRQ = new ofstream; temp.push_back(outRQ); + ofstream* outFI = new ofstream; temp.push_back(outFI); + ofstream* outRI = new ofstream; temp.push_back(outRI); tempfiles[i] = temp; vector names; @@ -1158,8 +1251,13 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c string rfastafilename = thisOutputDir + m->getRootName(m->getSimpleName(rfastq)) + toString(i) + "rfastatemp"; string fqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(ffastq)) + toString(i) + "fqualtemp"; string rqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(rfastq)) + toString(i) + "rqualtemp"; + string findexfilename = ""; string rindexfilename = ""; + noneOk = false; //flag to oligos file read that its okay to allow for non paired barcodes + if (findex != "") { findexfilename = thisOutputDir + m->getRootName(m->getSimpleName(findex)) + toString(i) + "findextemp"; m->openOutputFile(findexfilename, *outFI); noneOk = true; } + if (rindex != "") { rindexfilename = thisOutputDir + m->getRootName(m->getSimpleName(rindex)) + toString(i) + "rindextemp"; m->openOutputFile(rindexfilename, *outRI); noneOk = true; } names.push_back(ffastafilename); names.push_back(fqualfilename); names.push_back(rfastafilename); names.push_back(rqualfilename); + names.push_back(findexfilename); names.push_back(rindexfilename); files.push_back(names); m->openOutputFile(ffastafilename, *outFF); @@ -1173,7 +1271,7 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } //remove files for (int i = 0; i < files.size(); i++) { - for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } + for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } } } } @@ -1183,31 +1281,52 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c ifstream inReverse; m->openInputFile(rfastq, inReverse); + ifstream infIndex, inrIndex; + bool findexIsGood = false; + bool rindexIsGood = false; + if (findex != "") { m->openInputFile(findex, infIndex); findexIsGood = true; } + if (rindex != "") { m->openInputFile(rindex, inrIndex); rindexIsGood = true; } + count = 0; map uniques; + map iUniques; + map pairUniques; map::iterator itUniques; - while ((!inForward.eof()) || (!inReverse.eof())) { + while ((!inForward.eof()) || (!inReverse.eof()) || (findexIsGood) || (rindexIsGood)) { - if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; } + if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } } } inForward.close(); inReverse.close(); if (findex != "") { infIndex.close(); } if (findex != "") { inrIndex.close(); } return files; } //get a read from forward and reverse fastq files - bool ignoref, ignorer; - fastqRead thisFread, thisRread; + bool ignoref, ignorer, ignorefi, ignoreri; + fastqRead thisFread, thisRread, thisFIread, thisRIread; if (!inForward.eof()) { thisFread = readFastq(inForward, ignoref); } else { ignoref = true; } if (!inReverse.eof()) { thisRread = readFastq(inReverse, ignorer); } else { ignorer = true; } + if (findexIsGood) { thisFIread = readFastq(infIndex, ignorefi); if (infIndex.eof()) { findexIsGood = false; } } + else { ignorefi = true; } + if (rindexIsGood) { thisRIread = readFastq(inrIndex, ignoreri); if (inrIndex.eof()) { rindexIsGood = false; } } + else { ignoreri = true; } + + bool allowOne = false; + if ((findex == "") || (rindex == "")) { allowOne = true; } + vector frReads = getReads(ignoref, ignorer, thisFread, thisRread, uniques, false); + vector friReads = getReads(ignorefi, ignoreri, thisFIread, thisRIread, iUniques, allowOne); + + //add in index info if provided + vector reads = frReads; + if ((findex != "") || (rindex != "")) { reads = mergeReads(frReads, friReads, pairUniques); } - vector reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques); - for (int i = 0; i < reads.size(); i++) { fastqRead fread = reads[i].forward; fastqRead rread = reads[i].reverse; + fastqRead firead = reads[i].findex; + fastqRead riread = reads[i].rindex; - if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); } + if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); if (findex != "") { m->mothurOut(toString(count) + '\t' + firead.name + '\n'); } if (rindex != "") { m->mothurOut(toString(count) + '\t' + riread.name + '\n'); } } - if (checkReads(fread, rread, ffastq, rfastq)) { - if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; } + //if (checkReads(fread, rread, ffastq, rfastq)) { + if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } } } inForward.close(); inReverse.close(); if (findex != "") { infIndex.close(); } if (findex != "") { inrIndex.close(); } return files; } //if the reads are okay write to output files int process = count % processors; @@ -1220,12 +1339,14 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c *(tempfiles[process][3]) << ">" << rread.name << endl; for (int i = 0; i < rread.scores.size(); i++) { *(tempfiles[process][3]) << rread.scores[i] << " "; } *(tempfiles[process][3]) << endl; - + if (findex != "") { *(tempfiles[process][4]) << ">" << firead.name << endl << firead.sequence << endl; } + if (rindex != "") { *(tempfiles[process][5]) << ">" << riread.name << endl << riread.sequence << endl; } + count++; //report progress - if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } - } + if((count) % 10000 == 0){ m->mothurOutJustToScreen(toString(count)); m->mothurOutEndLine(); } + //} } } //report progress @@ -1233,8 +1354,13 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c if (uniques.size() != 0) { for (itUniques = uniques.begin(); itUniques != uniques.end(); itUniques++) { + if (m->control_pressed) { break; } m->mothurOut("[WARNING]: did not find paired read for " + itUniques->first + ", ignoring.\n"); } + for (map:: iterator it = pairUniques.begin(); it != pairUniques.end(); it++) { + if (m->control_pressed) { break; } + m->mothurOut("[WARNING]: did not find paired read for " + (it->first).substr(1) + ", ignoring.\n"); + } m->mothurOutEndLine(); } @@ -1242,7 +1368,9 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } inForward.close(); inReverse.close(); - + if (findex != "") { infIndex.close(); } + if (rindex != "") { inrIndex.close(); } + return files; } catch(exception& e) { @@ -1276,8 +1404,10 @@ vector< vector > MakeContigsCommand::readFastaFiles(unsigned long int& c if (fqualfile != "") { fqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(fqualfile)) + toString(i) + "fqual.temp"; m->openOutputFile(fqualfilename, *outFQ); } string rqualfilename = ""; if (rqualfile != "") { rqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(rqualfile)) + toString(i) + "rqual.temp"; m->openOutputFile(rqualfilename, *outRQ); } + string findexfilename = ""; string rindexfilename = ""; names.push_back(ffastafilename); names.push_back(fqualfilename); names.push_back(rfastafilename); names.push_back(rqualfilename); + names.push_back(findexfilename); names.push_back(rindexfilename); files.push_back(names); m->openOutputFile(ffastafilename, *outFF); @@ -1341,7 +1471,7 @@ vector< vector > MakeContigsCommand::readFastaFiles(unsigned long int& c }else { ignorer = true; } } - vector reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques); + vector reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques, false); for (int i = 0; i < reads.size(); i++) { fastqRead fread = reads[i].forward; @@ -1349,7 +1479,7 @@ vector< vector > MakeContigsCommand::readFastaFiles(unsigned long int& c if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); } - if (checkReads(fread, rread, ffasta, rfasta)) { + // if (checkReads(fread, rread, ffasta, rfasta)) { if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inReverseFasta.close(); inForwardFasta.close(); if (fqualfile != "") { inReverseQual.close(); inReverseQual.close(); } return files; } //if the reads are okay write to output files @@ -1369,7 +1499,7 @@ vector< vector > MakeContigsCommand::readFastaFiles(unsigned long int& c //report progress if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } - } + //} } } //report progress @@ -1396,7 +1526,7 @@ vector< vector > MakeContigsCommand::readFastaFiles(unsigned long int& c } } //********************************************************************************************************************** -vector MakeContigsCommand::getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map& uniques){ +vector MakeContigsCommand::getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map& uniques, bool allowOne){ try { vector reads; map::iterator itUniques; @@ -1444,25 +1574,36 @@ vector MakeContigsCommand::getReads(bool ignoref, bool ignorer, f } }else if (!ignoref && ignorer) { //ignore reverse keep forward - //look for forward pair - itUniques = uniques.find(forward.name); - if (itUniques != uniques.end()) { //we have the pair for this read - pairFastqRead temp(forward, itUniques->second); + if (allowOne) { + fastqRead dummy; + pairFastqRead temp(forward, dummy); reads.push_back(temp); - uniques.erase(itUniques); - }else { //save this read for later - uniques[forward.name] = forward; + }else { + //look for forward pair + itUniques = uniques.find(forward.name); + if (itUniques != uniques.end()) { //we have the pair for this read + pairFastqRead temp(forward, itUniques->second); + reads.push_back(temp); + uniques.erase(itUniques); + }else { //save this read for later + uniques[forward.name] = forward; + } } - }else if (ignoref && !ignorer) { //ignore forward keep reverse - //look for reverse pair - itUniques = uniques.find(reverse.name); - if (itUniques != uniques.end()) { //we have the pair for this read - pairFastqRead temp(itUniques->second, reverse); + if (allowOne) { + fastqRead dummy; + pairFastqRead temp(dummy, reverse); reads.push_back(temp); - uniques.erase(itUniques); - }else { //save this read for later - uniques[reverse.name] = reverse; + }else { + //look for reverse pair + itUniques = uniques.find(reverse.name); + if (itUniques != uniques.end()) { //we have the pair for this read + pairFastqRead temp(itUniques->second, reverse); + reads.push_back(temp); + uniques.erase(itUniques); + }else { //save this read for later + uniques[reverse.name] = reverse; + } } }//else ignore both and do nothing @@ -1474,6 +1615,73 @@ vector MakeContigsCommand::getReads(bool ignoref, bool ignorer, f } } //********************************************************************************************************************** +//look through the reads from the forward and reverse files and try to find matching reads from index files. +vector MakeContigsCommand::mergeReads(vector thisReads, vector indexes, map& uniques){ + try { + vector reads; + map::iterator itUniques; + + set foundIndexes; + for (int i = 0; i < thisReads.size(); i++) { + bool found = false; + for (int j = 0; j < indexes.size(); j++) { + + //incase only one index + string indexName = indexes[j].forward.name; + if (indexName == "") { indexName = indexes[j].reverse.name; } + + if (thisReads[i].forward.name == indexName){ + thisReads[i].findex = indexes[j].forward; + thisReads[i].rindex = indexes[j].reverse; + reads.push_back(thisReads[i]); + found = true; + foundIndexes.insert(j); + } + } + + if (!found) { + //look for forward pair + itUniques = uniques.find('i'+thisReads[i].forward.name); + if (itUniques != uniques.end()) { //we have the pair for this read + thisReads[i].findex = itUniques->second.forward; + thisReads[i].rindex = itUniques->second.reverse; + reads.push_back(thisReads[i]); + uniques.erase(itUniques); + }else { //save this read for later + uniques['r'+thisReads[i].forward.name] = thisReads[i]; + } + } + } + + if (foundIndexes.size() != indexes.size()) { //if we didnt match all the indexes look for them in uniques + for (int j = 0; j < indexes.size(); j++) { + if (foundIndexes.count(j) == 0) { //we didnt find this one + //incase only one index + string indexName = indexes[j].forward.name; + if (indexName == "") { indexName = indexes[j].reverse.name; } + + //look for forward pair + itUniques = uniques.find('r'+indexName); + if (itUniques != uniques.end()) { //we have the pair for this read + pairFastqRead temp(itUniques->second.forward, itUniques->second.reverse, indexes[j].forward, indexes[j].reverse); + reads.push_back(temp); + uniques.erase(itUniques); + }else { //save this read for later + uniques['i'+indexName] = indexes[j]; + } + } + } + } + + + return reads; + } + catch(exception& e) { + m->errorOut(e, "MakeContigsCommand", "mergeReads"); + exit(1); + } +} +//********************************************************************************************************************** fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){ try { fastqRead read; @@ -1508,18 +1716,14 @@ fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){ if (name2 != "") { if (name != name2) { m->mothurOut("[WARNING]: names do not match. read " + name + " for fasta and " + name2 + " for quality, ignoring."); ignore=true; } } if (quality.length() != sequence.length()) { m->mothurOut("[WARNING]: Lengths do not match for sequence " + name + ". Read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores, ignoring read."); ignore=true; } - vector qualScores; - int controlChar = int('!'); - for (int i = 0; i < quality.length(); i++) { - int temp = int(quality[i]); - temp -= controlChar; - - qualScores.push_back(temp); - } - + vector qualScores = convertQual(quality); + + m->checkName(name); read.name = name; read.sequence = sequence; read.scores = qualScores; + + if (m->debug) { m->mothurOut("[DEBUG]: " + read.name + " " + read.sequence + " " + quality + "\n"); } return read; } @@ -1528,7 +1732,7 @@ fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){ exit(1); } } -//********************************************************************************************************************** +/********************************************************************************************************************** bool MakeContigsCommand::checkReads(fastqRead& forward, fastqRead& reverse, string ffile, string rfile){ try { bool good = true; @@ -1551,12 +1755,18 @@ bool MakeContigsCommand::checkReads(fastqRead& forward, fastqRead& reverse, stri m->errorOut(e, "MakeContigsCommand", "checkReads"); exit(1); } -} +}*/ //*************************************************************************************************************** +//lines can be 2, 3, or 4 columns +// forward.fastq reverse.fastq -> 2 column +// groupName forward.fastq reverse.fastq -> 3 column +// forward.fastq reverse.fastq forward.index.fastq reverse.index.fastq -> 4 column +// forward.fastq reverse.fastq none reverse.index.fastq -> 4 column +// forward.fastq reverse.fastq forward.index.fastq none -> 4 column vector< vector > MakeContigsCommand::readFileNames(string filename){ try { vector< vector > files; - string forward, reverse; + string forward, reverse, findex, rindex; ifstream in; m->openInputFile(filename, in); @@ -1565,8 +1775,53 @@ vector< vector > MakeContigsCommand::readFileNames(string filename){ if (m->control_pressed) { return files; } - in >> forward; m->gobble(in); - in >> reverse; m->gobble(in); + string line = m->getline(in); m->gobble(in); + vector pieces = m->splitWhiteSpace(line); + + string group = ""; + if (pieces.size() == 2) { + forward = pieces[0]; + reverse = pieces[1]; + group = ""; + findex = ""; + rindex = ""; + }else if (pieces.size() == 3) { + group = pieces[0]; + forward = pieces[1]; + reverse = pieces[2]; + findex = ""; + rindex = ""; + createFileGroup = true; + }else if (pieces.size() == 4) { + forward = pieces[0]; + reverse = pieces[1]; + findex = pieces[2]; + rindex = pieces[3]; + if ((findex == "none") || (findex == "NONE")){ findex = ""; } + if ((rindex == "none") || (rindex == "NONE")){ rindex = ""; } + }else { + m->mothurOut("[ERROR]: file lines can be 2, 3, or 4 columns. The forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file, or forward fastq file then reverse fastq then forward index and reverse index file. If you only have one index file add 'none' for the other one. \n"); m->control_pressed = true; + } + + if (m->debug) { m->mothurOut("[DEBUG]: group = " + group + ", forward = " + forward + ", reverse = " + reverse + ", forwardIndex = " + findex + ", reverseIndex = " + rindex + ".\n"); } + + if (inputDir != "") { + string path = m->hasPath(forward); + if (path == "") { forward = inputDir + forward; } + + path = m->hasPath(reverse); + if (path == "") { reverse = inputDir + reverse; } + + if (findex != "") { + path = m->hasPath(findex); + if (path == "") { findex = inputDir + findex; } + } + + if (rindex != "") { + path = m->hasPath(rindex); + if (path == "") { rindex = inputDir + rindex; } + } + } //check to make sure both are able to be opened ifstream in2; @@ -1631,20 +1886,92 @@ vector< vector > MakeContigsCommand::readFileNames(string filename){ m->mothurOut("[WARNING]: can't find " + reverse + ", ignoring pair.\n"); }else{ in3.close(); } - if ((openForward != 1) && (openReverse != 1)) { //good pair + int openFindex = 0; + if (findex != "") { + ifstream in4; + openFindex = m->openInputFile(findex, in4, "noerror"); in4.close(); + + //if you can't open it, try default location + if (openFindex == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(findex); + m->mothurOut("Unable to open " + findex + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in5; + openFindex = m->openInputFile(tryPath, in5, "noerror"); + in5.close(); + findex = tryPath; + } + } + + //if you can't open it, try output location + if (openFindex == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(findex); + m->mothurOut("Unable to open " + findex + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in6; + openFindex = m->openInputFile(tryPath, in6, "noerror"); + findex = tryPath; + in6.close(); + } + } + + if (openFindex == 1) { //can't find it + m->mothurOut("[WARNING]: can't find " + findex + ", ignoring pair.\n"); + } + } + + int openRindex = 0; + if (rindex != "") { + ifstream in7; + openRindex = m->openInputFile(rindex, in7, "noerror"); in7.close(); + + //if you can't open it, try default location + if (openRindex == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(rindex); + m->mothurOut("Unable to open " + rindex + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in8; + openRindex = m->openInputFile(tryPath, in8, "noerror"); + in8.close(); + rindex = tryPath; + } + } + + //if you can't open it, try output location + if (openRindex == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(rindex); + m->mothurOut("Unable to open " + rindex + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in9; + openRindex = m->openInputFile(tryPath, in9, "noerror"); + rindex = tryPath; + in9.close(); + } + } + + if (openRindex == 1) { //can't find it + m->mothurOut("[WARNING]: can't find " + rindex + ", ignoring pair.\n"); + } + } + + + if ((openForward != 1) && (openReverse != 1) && (openFindex != 1) && (openRindex != 1)) { //good pair + file2Group[files.size()] = group; vector pair; pair.push_back(forward); pair.push_back(reverse); + pair.push_back(findex); + pair.push_back(rindex); + if (((findex != "") || (rindex != "")) && (oligosfile == "")) { m->mothurOut("[ERROR]: You need to provide an oligos file if you are going to use an index file.\n"); m->control_pressed = true; } files.push_back(pair); } - } in.close(); return files; } catch(exception& e) { - m->errorOut(e, "MakeContigsCommand", "checkReads"); + m->errorOut(e, "MakeContigsCommand", "readFileNames"); exit(1); } } @@ -1652,152 +1979,53 @@ vector< vector > MakeContigsCommand::readFileNames(string filename){ //illumina data requires paired forward and reverse data //BARCODE atgcatgc atgcatgc groupName //PRIMER atgcatgc atgcatgc groupName -//PRIMER atgcatgc atgcatgc -bool MakeContigsCommand::getOligos(vector >& fastaFileNames, vector >& qualFileNames, string rootname){ +//PRIMER atgcatgc atgcatgc +bool MakeContigsCommand::getOligos(vector >& fastaFileNames, string rootname, map& fastaFile2Group){ try { - ifstream in; - m->openInputFile(oligosfile, in); - - ofstream test; - - string type, foligo, roligo, group; + if (m->debug) { m->mothurOut("[DEBUG]: oligosfile = " + oligosfile + "\n"); } - int indexPrimer = 0; - int indexBarcode = 0; - set uniquePrimers; - set uniqueBarcodes; - - while(!in.eof()){ - - in >> type; - cout << type << endl; - if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); } - - if(type[0] == '#'){ - while (!in.eof()) { char c = in.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there - m->gobble(in); - } - else{ - m->gobble(in); - //make type case insensitive - for(int i=0;i> foligo; - - if (m->debug) { m->mothurOut("[DEBUG]: reading - " + foligo + ".\n"); } - - for(int i=0;igobble(in); - - in >> roligo; - - for(int i=0;imothurOut("primer pair " + newPrimer.forward + " " + newPrimer.reverse + " is in your oligos file already."); m->mothurOutEndLine(); } - else { uniquePrimers.insert(tempPair); } - - if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer pair " + newPrimer.forward + " " + newPrimer.reverse + ".\n"); } } - - primers[indexPrimer]=newPrimer; indexPrimer++; - primerNameVector.push_back(group); - }else if(type == "BARCODE"){ - m->gobble(in); - - in >> roligo; - - for(int i=0;idebug) { m->mothurOut("[DEBUG]: barcode pair " + newPair.forward + " " + newPair.reverse + ", and group = " + group + ".\n"); } - - //check for repeat barcodes - string tempPair = foligo+roligo; - if (uniqueBarcodes.count(tempPair) != 0) { m->mothurOut("barcode pair " + newPair.forward + " " + newPair.reverse + " is in your oligos file already, disregarding."); m->mothurOutEndLine(); } - else { uniqueBarcodes.insert(tempPair); } - - barcodes[indexBarcode]=newPair; indexBarcode++; - barcodeNameVector.push_back(group); - cout << group << endl; - }else if(type == "LINKER"){ - linker.push_back(foligo); - m->mothurOut("[WARNING]: make.contigs is not setup to remove linkers, ignoring.\n"); - }else if(type == "SPACER"){ - spacer.push_back(foligo); - m->mothurOut("[WARNING]: make.contigs is not setup to remove spacers, ignoring.\n"); - } - else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are primer, barcode, linker and spacer. Ignoring " + foligo + "."); m->mothurOutEndLine(); } - } - m->gobble(in); - } - in.close(); - - if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; } - - //add in potential combos - if(barcodeNameVector.size() == 0){ - oligosPair temp("", ""); - barcodes[0] = temp; - barcodeNameVector.push_back(""); - } - - if(primerNameVector.size() == 0){ - oligosPair temp("", ""); - primers[0] = temp; - primerNameVector.push_back(""); - } - - fastaFileNames.resize(barcodeNameVector.size()); + bool allBlank = false; + oligos->read(oligosfile, false); + + if (m->control_pressed) { return false; } //error in reading oligos + + if (oligos->hasPairedBarcodes()) { + numFPrimers = oligos->getPairedPrimers().size(); + numBarcodes = oligos->getPairedBarcodes().size(); + }else { + m->mothurOut("[ERROR]: make.contigs requires paired barcodes and primers. You can set one end to NONE if you are using an index file.\n"); m->control_pressed = true; + } + + if (m->control_pressed) { return false; } + + numLinkers = oligos->getLinkers().size(); + numSpacers = oligos->getSpacers().size(); + numRPrimers = oligos->getReversePrimers().size(); + if (numLinkers != 0) { m->mothurOut("[WARNING]: make.contigs is not setup to remove linkers, ignoring.\n"); } + if (numSpacers != 0) { m->mothurOut("[WARNING]: make.contigs is not setup to remove spacers, ignoring.\n"); } + + vector groupNames = oligos->getGroupNames(); + if (groupNames.size() == 0) { allFiles = 0; allBlank = true; } + + + fastaFileNames.resize(oligos->getBarcodeNames().size()); for(int i=0;igetPrimerNames().size();j++){ fastaFileNames[i].push_back(""); } } - qualFileNames = fastaFileNames; - - if(allFiles){ - set uniqueNames; //used to cleanup outputFileNames - for(map::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){ - for(map::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){ - - string primerName = primerNameVector[itPrimer->first]; - string barcodeName = barcodeNameVector[itBar->first]; + + if (allFiles) { + set uniqueNames; //used to cleanup outputFileNames + map barcodes = oligos->getPairedBarcodes(); + map primers = oligos->getPairedPrimers(); + for(map::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){ + for(map::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){ - if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing - else { + string primerName = oligos->getPrimerName(itPrimer->first); + string barcodeName = oligos->getBarcodeName(itBar->first); + + if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing + else if ((primerName == "") && (barcodeName == "")) { } //do nothing + else { string comboGroupName = ""; string fastaFileName = ""; string qualFileName = ""; @@ -1805,112 +2033,86 @@ bool MakeContigsCommand::getOligos(vector >& fastaFileNames, vect string countFileName = ""; if(primerName == ""){ - comboGroupName = barcodeNameVector[itBar->first]; - } - else{ + comboGroupName = barcodeName; + }else{ if(barcodeName == ""){ - comboGroupName = primerNameVector[itPrimer->first]; + comboGroupName = primerName; } else{ - comboGroupName = barcodeNameVector[itBar->first] + "." + primerNameVector[itPrimer->first]; + comboGroupName = barcodeName + "." + primerName; } } ofstream temp; - fastaFileName = rootname + comboGroupName + ".fasta"; + map variables; + variables["[filename]"] = rootname; + variables["[tag]"] = comboGroupName; + fastaFileName = getOutputFileName("fasta", variables); if (uniqueNames.count(fastaFileName) == 0) { outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName); uniqueNames.insert(fastaFileName); + fastaFile2Group[fastaFileName] = comboGroupName; } fastaFileNames[itBar->first][itPrimer->first] = fastaFileName; m->openOutputFile(fastaFileName, temp); temp.close(); - - if ((fqualfile != "") || (ffastqfile != "") || (file != "")) { - qualFileName = rootname + ".qual"; - if (uniqueNames.count(qualFileName) == 0) { - outputNames.push_back(qualFileName); - outputTypes["qfile"].push_back(qualFileName); - } - - qualFileNames[itBar->first][itPrimer->first] = qualFileName; - m->openOutputFile(qualFileName, temp); temp.close(); - } + cout << fastaFileName << endl; } - } - } - } - - bool allBlank = true; - for (int i = 0; i < barcodeNameVector.size(); i++) { - if (barcodeNameVector[i] != "") { - allBlank = false; - break; - } - } - for (int i = 0; i < primerNameVector.size(); i++) { - if (primerNameVector[i] != "") { - allBlank = false; - break; - } - } - - if (allBlank) { - m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine(); - allFiles = false; - return false; - } - - return true; - + } + } + } + + if (allBlank) { + m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine(); + allFiles = false; + return false; + } + + return true; + } catch(exception& e) { m->errorOut(e, "MakeContigsCommand", "getOligos"); exit(1); } } -//********************************************************************/ -string MakeContigsCommand::reverseOligo(string oligo){ +//********************************************************************************************************************** +vector MakeContigsCommand::convertQual(string qual) { try { - string reverse = ""; - - for(int i=oligo.length()-1;i>=0;i--){ - - if(oligo[i] == 'A') { reverse += 'T'; } - else if(oligo[i] == 'T'){ reverse += 'A'; } - else if(oligo[i] == 'U'){ reverse += 'A'; } - - else if(oligo[i] == 'G'){ reverse += 'C'; } - else if(oligo[i] == 'C'){ reverse += 'G'; } - - else if(oligo[i] == 'R'){ reverse += 'Y'; } - else if(oligo[i] == 'Y'){ reverse += 'R'; } - - else if(oligo[i] == 'M'){ reverse += 'K'; } - else if(oligo[i] == 'K'){ reverse += 'M'; } - - else if(oligo[i] == 'W'){ reverse += 'W'; } - else if(oligo[i] == 'S'){ reverse += 'S'; } - - else if(oligo[i] == 'B'){ reverse += 'V'; } - else if(oligo[i] == 'V'){ reverse += 'B'; } - - else if(oligo[i] == 'D'){ reverse += 'H'; } - else if(oligo[i] == 'H'){ reverse += 'D'; } + vector qualScores; + bool negativeScores = false; + + for (int i = 0; i < qual.length(); i++) { + + int temp = 0; + temp = int(qual[i]); + if (format == "illumina") { + temp -= 64; //char '@' + }else if (format == "illumina1.8+") { + temp -= int('!'); //char '!' + }else if (format == "solexa") { + temp = int(convertTable[temp]); //convert to sanger + temp -= int('!'); //char '!' + }else { + temp -= int('!'); //char '!' + } - else { reverse += 'N'; } - } - + if (temp < -5) { negativeScores = true; } + qualScores.push_back(temp); + } + + if (negativeScores) { m->mothurOut("[ERROR]: finding negative quality scores, do you have the right format selected? http://en.wikipedia.org/wiki/FASTQ_format#Encoding \n"); m->control_pressed = true; } - return reverse; - } + return qualScores; + } catch(exception& e) { - m->errorOut(e, "MakeContigsCommand", "reverseOligo"); + m->errorOut(e, "MakeContigsCommand", "convertQual"); exit(1); } } + //**********************************************************************************************************************