2 // makecontigscommand.cpp
5 // Created by Sarah Westcott on 5/15/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "makecontigscommand.h"
11 //**********************************************************************************************************************
12 vector<string> MakeContigsCommand::setParameters(){
14 CommandParameter pfastq("ffastq", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "fastqGroup","fasta-qfile",false,false,true); parameters.push_back(pfastq);
15 CommandParameter prfastq("rfastq", "InputTypes", "", "", "none", "none", "fastqGroup","fasta-qfile",false,false,true); parameters.push_back(prfastq);
16 CommandParameter pfasta("ffasta", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "fastaGroup","fasta",false,false,true); parameters.push_back(pfasta);
17 CommandParameter prfasta("rfasta", "InputTypes", "", "", "none", "none", "none","fastaGroup",false,false,true); parameters.push_back(prfasta);
18 CommandParameter pfqual("fqfile", "InputTypes", "", "", "none", "none", "qfileGroup","",false,false,true); parameters.push_back(pfqual);
19 CommandParameter prqual("rqfile", "InputTypes", "", "", "none", "none", "qfileGroup","",false,false,true); parameters.push_back(prqual);
20 CommandParameter pfile("file", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "none","fasta-qfile",false,false,true); parameters.push_back(pfile);
21 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos);
22 CommandParameter pfindex("findex", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(pfindex);
23 CommandParameter prindex("rindex", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(prindex);
24 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
25 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pbdiffs);
26 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs);
28 CommandParameter palign("align", "Multiple", "needleman-gotoh", "needleman", "", "", "","",false,false); parameters.push_back(palign);
29 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pallfiles);
30 CommandParameter ptrimoverlap("trimoverlap", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptrimoverlap);
31 CommandParameter pmatch("match", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pmatch);
32 CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pmismatch);
33 CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapopen);
34 CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pgapextend);
35 CommandParameter pthreshold("insert", "Number", "", "20", "", "", "","",false,false); parameters.push_back(pthreshold);
36 CommandParameter pdeltaq("deltaq", "Number", "", "6", "", "", "","",false,false); parameters.push_back(pdeltaq);
37 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
38 CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa-illumina1.8+", "illumina1.8+", "", "", "","",false,false,true); parameters.push_back(pformat);
39 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
40 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
42 vector<string> myArray;
43 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
47 m->errorOut(e, "MakeContigsCommand", "setParameters");
51 //**********************************************************************************************************************
52 string MakeContigsCommand::getHelpString(){
54 string helpString = "";
55 helpString += "The make.contigs command reads a file, forward fastq file and a reverse fastq file or forward fasta and reverse fasta files and outputs new fasta. \n";
56 helpString += "If an oligos file is provided barcodes and primers will be trimmed, and a group file will be created.\n";
57 helpString += "If a forward index or reverse index file is provided barcodes be trimmed, and a group file will be created. The oligos parameter is required if an index file is given.\n";
58 helpString += "The make.contigs command parameters are file, ffastq, rfastq, ffasta, rfasta, fqfile, rqfile, oligos, findex, rindex, format, tdiffs, bdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, insert, deltaq, allfiles and processors.\n";
59 helpString += "The ffastq and rfastq, file, or ffasta and rfasta parameters are required.\n";
60 helpString += "The file parameter is 2, 3 or 4 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file, or forward fastq file then reverse fastq then forward index and reverse index file. If you only have one index file add 'none' for the other one. Mothur will process each pair and create a combined fasta and report file with all the sequences.\n";
61 helpString += "The ffastq and rfastq parameters are used to provide a forward fastq and reverse fastq file to process. If you provide one, you must provide the other.\n";
62 helpString += "The ffasta and rfasta parameters are used to provide a forward fasta and reverse fasta file to process. If you provide one, you must provide the other.\n";
63 helpString += "The fqfile and rqfile parameters are used to provide a forward quality and reverse quality files to process with the ffasta and rfasta parameters. If you provide one, you must provide the other.\n";
64 helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=illumina1.8+.\n";
65 helpString += "The findex and rindex parameters are used to provide a forward index and reverse index files to process. \n";
66 helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh and needleman. The default is needleman.\n";
67 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
68 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
69 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
70 //helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
71 //helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
72 helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n";
73 helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n";
74 helpString += "The deltaq parameter allows you to specify the delta allowed between quality scores of a mismatched base. For example in the overlap, if deltaq=5 and in the alignment seqA, pos 200 has a quality score of 30 and the same position in seqB has a quality score of 20, you take the base from seqA (30-20 >= 5). If the quality score in seqB is 28 then the base in the consensus will be an N (30-28<5) The default is 6.\n";
75 helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n";
76 helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n";
77 helpString += "The insert parameter allows you to set a quality scores threshold. In the case where we are trying to decide whether to keep a base or remove it because the base is compared to a gap in the other fragment, if the base has a quality score equal to or below the threshold we eliminate it. Default=20.\n";
78 helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
79 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
81 helpString += "The trimoverlap parameter allows you to trim the sequences to only the overlapping section. The default is F.\n";
82 helpString += "The make.contigs command should be in the following format: \n";
83 helpString += "make.contigs(ffastq=yourForwardFastqFile, rfastq=yourReverseFastqFile, align=yourAlignmentMethod) \n";
84 helpString += "Note: No spaces between parameter labels (i.e. ffastq), '=' and parameters (i.e.yourForwardFastqFile).\n";
88 m->errorOut(e, "MakeContigsCommand", "getHelpString");
92 //**********************************************************************************************************************
93 string MakeContigsCommand::getOutputPattern(string type) {
97 if (type == "fasta") { pattern = "[filename],[tag],contigs.fasta"; }
98 else if (type == "group") { pattern = "[filename],[tag],contigs.groups"; }
99 else if (type == "report") { pattern = "[filename],[tag],contigs.report"; }
100 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
104 catch(exception& e) {
105 m->errorOut(e, "MakeContigsCommand", "getOutputPattern");
109 //**********************************************************************************************************************
110 MakeContigsCommand::MakeContigsCommand(){
112 abort = true; calledHelp = true;
114 vector<string> tempOutNames;
115 outputTypes["fasta"] = tempOutNames;
116 outputTypes["group"] = tempOutNames;
117 outputTypes["report"] = tempOutNames;
119 catch(exception& e) {
120 m->errorOut(e, "MakeContigsCommand", "MakeContigsCommand");
124 //**********************************************************************************************************************
125 MakeContigsCommand::MakeContigsCommand(string option) {
127 abort = false; calledHelp = false;
128 createFileGroup = false; createOligosGroup = false;
130 //allow user to run help
131 if(option == "help") { help(); abort = true; calledHelp = true; }
132 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
135 vector<string> myArray = setParameters();
137 OptionParser parser(option);
138 map<string, string> parameters = parser.getParameters();
140 ValidParameters validParameter("pairwise.seqs");
141 map<string, string>::iterator it;
143 //check to make sure all parameters are valid for command
144 for (it = parameters.begin(); it != parameters.end(); it++) {
145 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
148 //initialize outputTypes
149 vector<string> tempOutNames;
150 outputTypes["fasta"] = tempOutNames;
151 outputTypes["report"] = tempOutNames;
152 outputTypes["group"] = tempOutNames;
155 //if the user changes the input directory command factory will send this info to us in the output parameter
156 string inputDir = validParameter.validFile(parameters, "inputdir", false);
157 if (inputDir == "not found"){ inputDir = ""; }
160 it = parameters.find("ffastq");
161 //user has given a template file
162 if(it != parameters.end()){
163 path = m->hasPath(it->second);
164 //if the user has not given a path then, add inputdir. else leave path alone.
165 if (path == "") { parameters["ffastq"] = inputDir + it->second; }
168 it = parameters.find("rfastq");
169 //user has given a template file
170 if(it != parameters.end()){
171 path = m->hasPath(it->second);
172 //if the user has not given a path then, add inputdir. else leave path alone.
173 if (path == "") { parameters["rfastq"] = inputDir + it->second; }
176 it = parameters.find("ffasta");
177 //user has given a template file
178 if(it != parameters.end()){
179 path = m->hasPath(it->second);
180 //if the user has not given a path then, add inputdir. else leave path alone.
181 if (path == "") { parameters["ffasta"] = inputDir + it->second; }
184 it = parameters.find("rfasta");
185 //user has given a template file
186 if(it != parameters.end()){
187 path = m->hasPath(it->second);
188 //if the user has not given a path then, add inputdir. else leave path alone.
189 if (path == "") { parameters["rfasta"] = inputDir + it->second; }
192 it = parameters.find("fqfile");
193 //user has given a template file
194 if(it != parameters.end()){
195 path = m->hasPath(it->second);
196 //if the user has not given a path then, add inputdir. else leave path alone.
197 if (path == "") { parameters["fqfile"] = inputDir + it->second; }
200 it = parameters.find("rqfile");
201 //user has given a template file
202 if(it != parameters.end()){
203 path = m->hasPath(it->second);
204 //if the user has not given a path then, add inputdir. else leave path alone.
205 if (path == "") { parameters["rqfile"] = inputDir + it->second; }
208 it = parameters.find("file");
209 //user has given a template file
210 if(it != parameters.end()){
211 path = m->hasPath(it->second);
212 //if the user has not given a path then, add inputdir. else leave path alone.
213 if (path == "") { parameters["file"] = inputDir + it->second; }
216 it = parameters.find("oligos");
217 //user has given a template file
218 if(it != parameters.end()){
219 path = m->hasPath(it->second);
220 //if the user has not given a path then, add inputdir. else leave path alone.
221 if (path == "") { parameters["oligos"] = inputDir + it->second; }
224 it = parameters.find("findex");
225 //user has given a template file
226 if(it != parameters.end()){
227 path = m->hasPath(it->second);
228 //if the user has not given a path then, add inputdir. else leave path alone.
229 if (path == "") { parameters["findex"] = inputDir + it->second; }
232 it = parameters.find("rindex");
233 //user has given a template file
234 if(it != parameters.end()){
235 path = m->hasPath(it->second);
236 //if the user has not given a path then, add inputdir. else leave path alone.
237 if (path == "") { parameters["rindex"] = inputDir + it->second; }
241 ffastqfile = validParameter.validFile(parameters, "ffastq", true);
242 if (ffastqfile == "not open") { abort = true; }
243 else if (ffastqfile == "not found") { ffastqfile = ""; }
245 rfastqfile = validParameter.validFile(parameters, "rfastq", true);
246 if (rfastqfile == "not open") { abort = true; }
247 else if (rfastqfile == "not found") { rfastqfile = ""; }
249 ffastafile = validParameter.validFile(parameters, "ffasta", true);
250 if (ffastafile == "not open") { abort = true; }
251 else if (ffastafile == "not found") { ffastafile = ""; }
253 rfastafile = validParameter.validFile(parameters, "rfasta", true);
254 if (rfastafile == "not open") { abort = true; }
255 else if (rfastafile == "not found") { rfastafile = ""; }
257 fqualfile = validParameter.validFile(parameters, "fqfile", true);
258 if (fqualfile == "not open") { abort = true; }
259 else if (fqualfile == "not found") { fqualfile = ""; }
261 rqualfile = validParameter.validFile(parameters, "rqfile", true);
262 if (rqualfile == "not open") { abort = true; }
263 else if (rqualfile == "not found") { rqualfile = ""; }
265 file = validParameter.validFile(parameters, "file", true);
266 if (file == "not open") { abort = true; }
267 else if (file == "not found") { file = ""; }
270 if ((file == "") && (ffastafile == "") && (ffastqfile == "")) { abort = true; m->mothurOut("[ERROR]: The file, ffastq and rfastq or ffasta and rfasta parameters are required.\n"); }
271 if ((file != "") && ((ffastafile != "") || (ffastqfile != ""))) { abort = true; m->mothurOut("[ERROR]: The file, ffastq and rfastq or ffasta and rfasta parameters are required.\n"); }
272 if ((ffastqfile != "") && (rfastqfile == "")) { abort = true; m->mothurOut("[ERROR]: If you provide use the ffastq, you must provide a rfastq file.\n"); }
273 if ((ffastqfile == "") && (rfastqfile != "")) { abort = true; m->mothurOut("[ERROR]: If you provide use the rfastq, you must provide a ffastq file.\n"); }
274 if ((ffastafile != "") && (rfastafile == "")) { abort = true; m->mothurOut("[ERROR]: If you provide use the ffasta, you must provide a rfasta file.\n"); }
275 if ((ffastafile == "") && (rfastafile != "")) { abort = true; m->mothurOut("[ERROR]: If you provide use the rfasta, you must provide a ffasta file.\n"); }
276 if ((fqualfile != "") && (rqualfile == "")) { abort = true; m->mothurOut("[ERROR]: If you provide use the fqfile, you must provide a rqfile file.\n"); }
277 if ((fqualfile == "") && (rqualfile != "")) { abort = true; m->mothurOut("[ERROR]: If you provide use the rqfile, you must provide a fqfile file.\n"); }
278 if (((fqualfile != "") || (rqualfile != "")) && ((ffastafile == "") || (rfastafile == ""))) {
279 abort = true; m->mothurOut("[ERROR]: If you provide use the rqfile or fqfile file, you must provide the ffasta and rfasta parameters.\n");
282 oligosfile = validParameter.validFile(parameters, "oligos", true);
283 if (oligosfile == "not found") { oligosfile = ""; }
284 else if(oligosfile == "not open") { abort = true; }
285 else { m->setOligosFile(oligosfile); }
287 findexfile = validParameter.validFile(parameters, "findex", true);
288 if (findexfile == "not found") { findexfile = ""; }
289 else if(findexfile == "not open") { abort = true; }
291 rindexfile = validParameter.validFile(parameters, "rindex", true);
292 if (rindexfile == "not found") { rindexfile = ""; }
293 else if(rindexfile == "not open") { abort = true; }
295 if ((rindexfile != "") || (findexfile != "")) {
296 if (oligosfile == ""){
297 oligosfile = m->getOligosFile();
298 if (oligosfile != "") { m->mothurOut("Using " + oligosfile + " as input file for the oligos parameter.\n"); }
300 m->mothurOut("You need to provide an oligos file if you are going to use an index file.\n"); abort = true;
304 //can only use an index file with the fastq parameters not fasta and qual
305 if ((ffastafile != "") || (rfastafile != "")) {
306 m->mothurOut("[ERROR]: You can only use an index file with the fastq parameters or the file option.\n"); abort = true;
310 //if the user changes the output directory command factory will send this info to us in the output parameter
311 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
316 //check for optional parameter and set defaults
317 // ...at some point should added some additional type checking...
319 temp = validParameter.validFile(parameters, "match", false); if (temp == "not found"){ temp = "1.0"; }
320 m->mothurConvert(temp, match);
322 temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found"){ temp = "-1.0"; }
323 m->mothurConvert(temp, misMatch);
324 if (misMatch > 0) { m->mothurOut("[ERROR]: mismatch must be negative.\n"); abort=true; }
326 temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-2.0"; }
327 m->mothurConvert(temp, gapOpen);
328 if (gapOpen > 0) { m->mothurOut("[ERROR]: gapopen must be negative.\n"); abort=true; }
330 temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-1.0"; }
331 m->mothurConvert(temp, gapExtend);
332 if (gapExtend > 0) { m->mothurOut("[ERROR]: gapextend must be negative.\n"); abort=true; }
334 temp = validParameter.validFile(parameters, "insert", false); if (temp == "not found"){ temp = "20"; }
335 m->mothurConvert(temp, insert);
336 if ((insert < 0) || (insert > 40)) { m->mothurOut("[ERROR]: insert must be between 0 and 40.\n"); abort=true; }
338 temp = validParameter.validFile(parameters, "deltaq", false); if (temp == "not found"){ temp = "6"; }
339 m->mothurConvert(temp, deltaq);
341 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
342 m->setProcessors(temp);
343 m->mothurConvert(temp, processors);
345 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
346 m->mothurConvert(temp, bdiffs);
348 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
349 m->mothurConvert(temp, pdiffs);
351 // temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
352 // m->mothurConvert(temp, ldiffs);
355 // temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
356 // m->mothurConvert(temp, sdiffs);
359 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
360 m->mothurConvert(temp, tdiffs);
362 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; } //+ ldiffs + sdiffs;
364 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
365 allFiles = m->isTrue(temp);
368 temp = validParameter.validFile(parameters, "trimoverlap", false); if (temp == "not found") { temp = "F"; }
369 trimOverlap = m->isTrue(temp);
371 align = validParameter.validFile(parameters, "align", false); if (align == "not found"){ align = "needleman"; }
372 if ((align != "needleman") && (align != "gotoh")) { m->mothurOut(align + " is not a valid alignment method. Options are needleman or gotoh. I will use needleman."); m->mothurOutEndLine(); align = "needleman"; }
374 format = validParameter.validFile(parameters, "format", false); if (format == "not found"){ format = "illumina1.8+"; }
376 if ((format != "sanger") && (format != "illumina") && (format != "illumina1.8+") && (format != "solexa")) {
377 m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa, illumina1.8+ and illumina, aborting." ); m->mothurOutEndLine();
381 //fill convert table - goes from solexa to sanger. Used fq_all2std.pl as a reference.
382 for (int i = -64; i < 65; i++) {
383 char temp = (char) ((int)(33 + 10*log(1+pow(10,(i/10.0)))/log(10)+0.499));
384 convertTable.push_back(temp);
389 catch(exception& e) {
390 m->errorOut(e, "MakeContigsCommand", "MakeContigsCommand");
394 //**********************************************************************************************************************
395 int MakeContigsCommand::execute(){
397 if (abort == true) { if (calledHelp) { return 0; } return 2; }
399 //read ffastq and rfastq files creating fasta and qual files.
400 //this function will create a forward and reverse, fasta and qual files for each processor.
401 //files has an entry for each processor. files[i][0] = forwardFasta, files[i][1] = forwardQual, files[i][2] = reverseFasta, files[i][3] = reverseQual. filesToProcess is for each filepair in the file parameter file. for ffastq and rfastq this will be size 1.
402 unsigned long int numReads = 0;
403 int start = time(NULL);
405 m->mothurOut("Reading fastq data...\n");
406 vector < vector< vector<string> > > filesToProcess = preProcessData(numReads);
407 m->mothurOut("Done.\n");
409 if (m->control_pressed) { return 0; }
411 map<string, string> cvars;
412 string compOutputDir = outputDir;
413 if (outputDir == "") { compOutputDir = m->hasPath(file); }
414 cvars["[filename]"] = compOutputDir + m->getRootName(m->getSimpleName(file));
416 string compositeGroupFile = getOutputFileName("group",cvars);
417 cvars["[tag]"] = "trim";
418 string compositeFastaFile = getOutputFileName("fasta",cvars);
419 cvars["[tag]"] = "scrap";
420 string compositeScrapFastaFile = getOutputFileName("fasta",cvars);
422 string compositeMisMatchFile = getOutputFileName("report",cvars);
424 if (filesToProcess.size() > 1) { //clear files for append below
425 ofstream outCTFasta, outCTQual, outCSFasta, outCSQual, outCMisMatch;
426 m->openOutputFile(compositeFastaFile, outCTFasta); outCTFasta.close();
427 m->openOutputFile(compositeScrapFastaFile, outCSFasta); outCSFasta.close();
428 m->openOutputFile(compositeMisMatchFile, outCMisMatch); outCMisMatch.close();
429 outputNames.push_back(compositeFastaFile); outputTypes["fasta"].push_back(compositeFastaFile);
430 outputNames.push_back(compositeMisMatchFile); outputTypes["report"].push_back(compositeMisMatchFile);
431 outputNames.push_back(compositeScrapFastaFile); outputTypes["fasta"].push_back(compositeScrapFastaFile);
434 map<string, int> totalGroupCounts;
436 for (int l = 0; l < filesToProcess.size(); l++) {
438 m->mothurOut("\n>>>>>\tProcessing " + filesToProcess[l][0][0] + " (file " + toString(l+1) + " of " + toString(filesToProcess.size()) + ")\t<<<<<\n");
442 vector<vector<string> > fastaFileNames;
443 createOligosGroup = false;
444 string outputGroupFileName;
445 map<string, string> variables;
446 string thisOutputDir = outputDir;
447 if (outputDir == "") { thisOutputDir = m->hasPath(filesToProcess[l][0][0]); }
448 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(filesToProcess[l][0][0]));
449 variables["[tag]"] = "";
450 if(oligosfile != ""){ createOligosGroup = getOligos(fastaFileNames, variables["[filename]"]); }
451 if (createOligosGroup || createFileGroup) {
452 outputGroupFileName = getOutputFileName("group",variables);
455 //give group in file file precedence
456 if (createFileGroup) { createOligosGroup = false; }
458 variables["[tag]"] = "trim";
459 string outFastaFile = getOutputFileName("fasta",variables);
460 variables["[tag]"] = "scrap";
461 string outScrapFastaFile = getOutputFileName("fasta",variables);
462 variables["[tag]"] = "";
463 string outMisMatchFile = getOutputFileName("report",variables);
465 m->mothurOut("Making contigs...\n");
466 createProcesses(filesToProcess[l], outFastaFile, outScrapFastaFile, outMisMatchFile, fastaFileNames, l);
468 //remove temp fasta and qual files
469 for (int i = 0; i < processors; i++) { for(int j = 0; j < filesToProcess[l][i].size(); j++) { m->mothurRemove(filesToProcess[l][i][j]); } }
471 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
474 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
475 map<string, string>::iterator it;
476 set<string> namesToRemove;
477 for(int i=0;i<fastaFileNames.size();i++){
478 for(int j=0;j<fastaFileNames[0].size();j++){
479 if (fastaFileNames[i][j] != "") {
480 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
481 if(m->isBlank(fastaFileNames[i][j])){
482 m->mothurRemove(fastaFileNames[i][j]);
483 namesToRemove.insert(fastaFileNames[i][j]);
485 it = uniqueFastaNames.find(fastaFileNames[i][j]);
486 if (it == uniqueFastaNames.end()) {
487 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
495 //remove names for outputFileNames, just cleans up the output
496 vector<string> outputNames2;
497 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
498 outputNames = outputNames2;
500 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
502 m->openInputFile(it->first, in);
505 string thisGroupName = thisOutputDir + m->getRootName(m->getSimpleName(it->first));
506 thisGroupName += getOutputFileName("group",variables); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
507 m->openOutputFile(thisGroupName, out);
510 if (m->control_pressed) { break; }
512 Sequence currSeq(in); m->gobble(in);
513 out << currSeq.getName() << '\t' << it->second << endl;
520 if (createFileGroup || createOligosGroup) {
522 m->openOutputFile(outputGroupFileName, outGroup);
523 for (map<string, string>::iterator itGroup = groupMap.begin(); itGroup != groupMap.end(); itGroup++) {
524 outGroup << itGroup->first << '\t' << itGroup->second << endl;
529 if (filesToProcess.size() > 1) { //merge into large combo files
530 if (createFileGroup || createOligosGroup) {
533 m->openOutputFile(compositeGroupFile, outCGroup); outCGroup.close();
534 outputNames.push_back(compositeGroupFile); outputTypes["group"].push_back(compositeGroupFile);
536 m->appendFiles(outputGroupFileName, compositeGroupFile);
537 if (!allFiles) { m->mothurRemove(outputGroupFileName); }
538 else { outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); }
540 for (map<string, int>::iterator itGroups = groupCounts.begin(); itGroups != groupCounts.end(); itGroups++) {
541 map<string, int>::iterator itTemp = totalGroupCounts.find(itGroups->first);
542 if (itTemp == totalGroupCounts.end()) { totalGroupCounts[itGroups->first] = itGroups->second; } //new group create it in totalGroups
543 else { itTemp->second += itGroups->second; } //existing group, update total
546 if (l == 0) { m->appendFiles(outMisMatchFile, compositeMisMatchFile); }
547 else { m->appendFilesWithoutHeaders(outMisMatchFile, compositeMisMatchFile); }
548 m->appendFiles(outFastaFile, compositeFastaFile);
549 m->appendFiles(outScrapFastaFile, compositeScrapFastaFile);
551 m->mothurRemove(outMisMatchFile);
552 m->mothurRemove(outFastaFile);
553 m->mothurRemove(outScrapFastaFile);
555 outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile);
556 outputNames.push_back(outScrapFastaFile); outputTypes["fasta"].push_back(outScrapFastaFile);
557 outputNames.push_back(outMisMatchFile); outputTypes["report"].push_back(outMisMatchFile);
560 totalGroupCounts = groupCounts;
561 outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile);
562 outputNames.push_back(outScrapFastaFile); outputTypes["fasta"].push_back(outScrapFastaFile);
563 outputNames.push_back(outMisMatchFile); outputTypes["report"].push_back(outMisMatchFile);
564 if (createFileGroup || createOligosGroup) {
565 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
568 m->mothurOut("Done.\n");
571 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process " + toString(numReads) + " sequences.\n");
573 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
575 //output group counts
576 m->mothurOutEndLine();
578 if (totalGroupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
579 for (map<string, int>::iterator it = totalGroupCounts.begin(); it != totalGroupCounts.end(); it++) {
580 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
582 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
584 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
586 string currentFasta = "";
587 itTypes = outputTypes.find("fasta");
588 if (itTypes != outputTypes.end()) {
589 if ((itTypes->second).size() != 0) { currentFasta = (itTypes->second)[0]; m->setFastaFile(currentFasta); }
592 string currentGroup = "";
593 itTypes = outputTypes.find("group");
594 if (itTypes != outputTypes.end()) {
595 if ((itTypes->second).size() != 0) { currentGroup = (itTypes->second)[0]; m->setGroupFile(currentGroup); }
598 //output files created by command
599 m->mothurOutEndLine();
600 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
601 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
602 m->mothurOutEndLine();
606 catch(exception& e) {
607 m->errorOut(e, "MakeContigsCommand", "execute");
611 //**********************************************************************************************************************
612 vector< vector< vector<string> > > MakeContigsCommand::preProcessData(unsigned long int& numReads) {
614 vector< vector< vector<string> > > filesToProcess;
616 if (ffastqfile != "") { //reading one file
617 vector< vector<string> > files = readFastqFiles(numReads, ffastqfile, rfastqfile, findexfile, rindexfile);
618 //adjust for really large processors or really small files
619 if (numReads == 0) { m->mothurOut("[ERROR]: no good reads.\n"); m->control_pressed = true; }
620 if (numReads < processors) {
621 for (int i = numReads; i < processors; i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } files[i].clear(); }
622 files.resize(numReads);
623 processors = numReads;
625 filesToProcess.push_back(files);
626 }else if (file != "") { //reading multiple files
627 //return only valid pairs
628 vector< vector<string> > filePairsToProcess = readFileNames(file);
630 if (m->control_pressed) { return filesToProcess; }
632 if (filePairsToProcess.size() != 0) {
633 for (int i = 0; i < filePairsToProcess.size(); i++) {
635 if (m->control_pressed) { for (int l = 0; l < filesToProcess.size(); l++) { for (int k = 0; k < filesToProcess[l].size(); k++) { for(int j = 0; j < filesToProcess[l][k].size(); j++) { m->mothurRemove(filesToProcess[l][k][j]); } filesToProcess[l][k].clear(); } return filesToProcess; } }
637 unsigned long int thisFilesReads;
638 vector< vector<string> > files = readFastqFiles(thisFilesReads, filePairsToProcess[i][0], filePairsToProcess[i][1], filePairsToProcess[i][2], filePairsToProcess[i][3]);
640 //adjust for really large processors or really small files
641 if (thisFilesReads < processors) {
642 m->mothurOut("[ERROR]: " + filePairsToProcess[i][0] + " has less than " + toString(processors) + " good reads, skipping\n");
643 for (int k = 0; k < files.size(); k++) { for(int j = 0; j < files[k].size(); j++) { m->mothurRemove(files[k][j]); } files[k].clear(); }
644 //remove from file2Group if necassary
645 map<int, string> cFile2Group;
646 for (map<int, string>::iterator it = file2Group.begin(); it != file2Group.end(); it++) {
647 if ((it->first) < i) { cFile2Group[it->first] = it->second; }
648 else if ((it->first) == i) { } //do nothing, we removed files for i
649 else { cFile2Group[(it->first-1)] = it->second; } //adjust files because i was removed
651 file2Group = cFile2Group;
653 filesToProcess.push_back(files);
654 numReads += thisFilesReads;
658 if (numReads == 0) { m->control_pressed = true; }
660 }else if (ffastafile != "") {
661 vector< vector<string> > files = readFastaFiles(numReads, ffastafile, rfastafile);
662 //adjust for really large processors or really small files
663 if (numReads == 0) { m->mothurOut("[ERROR]: no good reads.\n"); m->control_pressed = true; }
664 if (numReads < processors) {
665 for (int i = numReads; i < processors; i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } files[i].clear(); }
666 files.resize(numReads);
667 processors = numReads;
669 filesToProcess.push_back(files);
670 }else { m->control_pressed = true; } //should not get here
672 return filesToProcess;
674 catch(exception& e) {
675 m->errorOut(e, "MakeContigsCommand", "preProcessData");
679 //**********************************************************************************************************************
680 int MakeContigsCommand::createProcesses(vector< vector<string> > files, string outputFasta, string outputScrapFasta, string outputMisMatches, vector<vector<string> > fastaFileNames, int index) {
683 vector<int> processIDS;
685 map<int, string>::iterator it = file2Group.find(index);
686 if (it != file2Group.end()) { group = it->second; }
688 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
691 //loop through and create all the processes you want
692 while (process != processors-1) {
696 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
699 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
704 for(int i=0;i<tempFASTAFileNames.size();i++){
705 for(int j=0;j<tempFASTAFileNames[i].size();j++){
706 if (tempFASTAFileNames[i][j] != "") {
707 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
708 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
714 num = driver(files[process],
715 outputFasta + toString(getpid()) + ".temp",
716 outputScrapFasta + toString(getpid()) + ".temp",
717 outputMisMatches + toString(getpid()) + ".temp",
718 tempFASTAFileNames, process, group);
720 //pass groupCounts to parent
722 string tempFile = toString(getpid()) + ".num.temp";
723 m->openOutputFile(tempFile, out);
725 if (createFileGroup || createOligosGroup) {
726 out << groupCounts.size() << endl;
728 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
729 out << it->first << '\t' << it->second << endl;
732 out << groupMap.size() << endl;
733 for (map<string, string>::iterator it = groupMap.begin(); it != groupMap.end(); it++) {
734 out << it->first << '\t' << it->second << endl;
741 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
742 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
748 m->openOutputFile(outputFasta, temp); temp.close();
749 m->openOutputFile(outputScrapFasta, temp); temp.close();
752 num = driver(files[processors-1], outputFasta, outputScrapFasta, outputMisMatches, fastaFileNames, processors-1, group);
754 //force parent to wait until all the processes are done
755 for (int i=0;i<processIDS.size();i++) {
756 int temp = processIDS[i];
760 for (int i = 0; i < processIDS.size(); i++) {
762 string tempFile = toString(processIDS[i]) + ".num.temp";
763 m->openInputFile(tempFile, in);
765 in >> tempNum; num += tempNum; m->gobble(in);
767 if (createFileGroup || createOligosGroup) {
769 in >> tempNum; m->gobble(in);
772 for (int j = 0; j < tempNum; j++) {
774 in >> group >> groupNum; m->gobble(in);
776 map<string, int>::iterator it = groupCounts.find(group);
777 if (it == groupCounts.end()) { groupCounts[group] = groupNum; }
778 else { groupCounts[it->first] += groupNum; }
781 in >> tempNum; m->gobble(in);
783 for (int j = 0; j < tempNum; j++) {
784 string group, seqName;
785 in >> seqName >> group; m->gobble(in);
787 map<string, string>::iterator it = groupMap.find(seqName);
788 if (it == groupMap.end()) { groupMap[seqName] = group; }
789 else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
793 in.close(); m->mothurRemove(tempFile);
797 //////////////////////////////////////////////////////////////////////////////////////////////////////
798 //Windows version shared memory, so be careful when passing variables through the contigsData struct.
799 //Above fork() will clone, so memory is separate, but that's not the case with windows,
800 //////////////////////////////////////////////////////////////////////////////////////////////////////
802 vector<contigsData*> pDataArray;
803 DWORD dwThreadIdArray[processors-1];
804 HANDLE hThreadArray[processors-1];
806 //Create processor worker threads.
807 for( int h=0; h<processors-1; h++ ){
808 string extension = "";
809 if (h != 0) { extension = toString(h) + ".temp"; processIDS.push_back(h); }
810 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
815 for(int i=0;i<tempFASTAFileNames.size();i++){
816 for(int j=0;j<tempFASTAFileNames[i].size();j++){
817 if (tempFASTAFileNames[i][j] != "") {
818 tempFASTAFileNames[i][j] += extension;
819 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
825 contigsData* tempcontig = new contigsData(group, files[h], (outputFasta + extension), (outputScrapFasta + extension), (outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, insert, deltaq, barcodes, primers, tempFASTAFileNames, barcodeNameVector, primerNameVector, pdiffs, bdiffs, tdiffs, createOligosGroup, createFileGroup, allFiles, trimOverlap, h);
826 pDataArray.push_back(tempcontig);
828 hThreadArray[h] = CreateThread(NULL, 0, MyContigsThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]);
831 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
835 string extension = toString(processors-1) + ".temp";
837 for(int i=0;i<tempFASTAFileNames.size();i++){
838 for(int j=0;j<tempFASTAFileNames[i].size();j++){
839 if (tempFASTAFileNames[i][j] != "") {
840 tempFASTAFileNames[i][j] += extension;
841 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
849 m->openOutputFile(outputFasta, temp); temp.close();
850 m->openOutputFile(outputScrapFasta, temp); temp.close();
853 processIDS.push_back(processors-1);
854 num = driver(files[processors-1], (outputFasta+ toString(processors-1) + ".temp"), (outputScrapFasta+ toString(processors-1) + ".temp"), (outputMisMatches+ toString(processors-1) + ".temp"), tempFASTAFileNames, processors-1, group);
856 //Wait until all threads have terminated.
857 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
859 //Close all thread handles and free memory allocations.
860 for(int i=0; i < pDataArray.size(); i++){
861 num += pDataArray[i]->count;
862 if (!pDataArray[i]->done) {
863 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of sequences assigned to it, quitting. \n"); m->control_pressed = true;
865 for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
866 map<string, int>::iterator it2 = groupCounts.find(it->first);
867 if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
868 else { groupCounts[it->first] += it->second; }
870 for (map<string, string>::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) {
871 map<string, string>::iterator it2 = groupMap.find(it->first);
872 if (it2 == groupMap.end()) { groupMap[it->first] = it->second; }
873 else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
875 CloseHandle(hThreadArray[i]);
876 delete pDataArray[i];
881 for (int i = 0; i < processIDS.size(); i++) {
882 m->appendFiles((outputFasta + toString(processIDS[i]) + ".temp"), outputFasta);
883 m->mothurRemove((outputFasta + toString(processIDS[i]) + ".temp"));
885 m->appendFiles((outputScrapFasta + toString(processIDS[i]) + ".temp"), outputScrapFasta);
886 m->mothurRemove((outputScrapFasta + toString(processIDS[i]) + ".temp"));
888 m->appendFilesWithoutHeaders((outputMisMatches + toString(processIDS[i]) + ".temp"), outputMisMatches);
889 m->mothurRemove((outputMisMatches + toString(processIDS[i]) + ".temp"));
892 for(int j=0;j<fastaFileNames.size();j++){
893 for(int k=0;k<fastaFileNames[j].size();k++){
894 if (fastaFileNames[j][k] != "") {
895 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
896 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
905 catch(exception& e) {
906 m->errorOut(e, "MakeContigsCommand", "createProcesses");
910 //**********************************************************************************************************************
911 int MakeContigsCommand::driver(vector<string> files, string outputFasta, string outputScrapFasta, string outputMisMatches, vector<vector<string> > fastaFileNames, int process, string group){
914 Alignment* alignment;
915 if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); }
916 else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); }
919 string thisffastafile = files[0];
920 string thisfqualfile = files[1];
921 string thisrfastafile = files[2];
922 string thisrqualfile = files[3];
923 string thisfindexfile = files[4];
924 string thisrindexfile = files[5];
926 if (m->debug) { m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n[DEBUG]: findex = " + thisfindexfile + ".\n[DEBUG]: rindex = " + thisrindexfile + ".\n"); }
928 ifstream inFFasta, inRFasta, inFQual, inRQual, inFIndex, inRIndex;
929 ofstream outFasta, outMisMatch, outScrapFasta;
930 m->openInputFile(thisffastafile, inFFasta);
931 m->openInputFile(thisrfastafile, inRFasta);
932 if (thisfqualfile != "") {
933 m->openInputFile(thisfqualfile, inFQual);
934 m->openInputFile(thisrqualfile, inRQual);
937 if (thisfindexfile != "") { m->openInputFile(thisfindexfile, inFIndex); }
938 if (thisrindexfile != "") { m->openInputFile(thisrindexfile, inRIndex); }
940 m->openOutputFile(outputFasta, outFasta);
941 m->openOutputFile(outputScrapFasta, outScrapFasta);
942 m->openOutputFile(outputMisMatches, outMisMatch);
943 outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n";
945 TrimOligos trimOligos(pdiffs, bdiffs, 0, 0, primers, barcodes);
947 while ((!inFFasta.eof()) && (!inRFasta.eof())) {
949 if (m->control_pressed) { break; }
952 string trashCode = "";
953 int currentSeqsDiffs = 0;
955 //read seqs and quality info
956 Sequence fSeq(inFFasta); m->gobble(inFFasta);
957 Sequence rSeq(inRFasta); m->gobble(inRFasta);
958 QualityScores* fQual = NULL; QualityScores* rQual = NULL;
959 if (thisfqualfile != "") {
960 fQual = new QualityScores(inFQual); m->gobble(inFQual);
961 rQual = new QualityScores(inRQual); m->gobble(inRQual);
963 Sequence findexBarcode("findex", "NONE"); Sequence rindexBarcode("rindex", "NONE");
964 if (thisfindexfile != "") {
965 Sequence temp(inFIndex); m->gobble(inFIndex);
966 findexBarcode.setAligned(temp.getAligned());
969 if (thisrindexfile != "") {
970 Sequence temp(inRIndex); m->gobble(inRIndex);
971 rindexBarcode.setAligned(temp.getAligned());
974 int barcodeIndex = 0;
977 if(barcodes.size() != 0){
978 if (thisfqualfile != "") {
979 if ((thisfindexfile != "") || (thisrindexfile != "")) {
980 success = trimOligos.stripBarcode(findexBarcode, rindexBarcode, *fQual, *rQual, barcodeIndex);
982 success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
985 success = trimOligos.stripBarcode(fSeq, rSeq, barcodeIndex);
987 if(success > bdiffs) { trashCode += 'b'; }
988 else{ currentSeqsDiffs += success; }
991 if(primers.size() != 0){
992 if (thisfqualfile != "") {
993 success = trimOligos.stripForward(fSeq, rSeq, *fQual, *rQual, primerIndex);
995 success = trimOligos.stripForward(fSeq, rSeq, primerIndex);
997 if(success > pdiffs) { trashCode += 'f'; }
998 else{ currentSeqsDiffs += success; }
1001 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
1003 //flip the reverse reads
1004 rSeq.reverseComplement();
1005 if (thisfqualfile != "") { rQual->flipQScores(); }
1008 alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned());
1009 map<int, int> ABaseMap = alignment->getSeqAAlnBaseMap();
1010 map<int, int> BBaseMap = alignment->getSeqBAlnBaseMap();
1011 fSeq.setAligned(alignment->getSeqAAln());
1012 rSeq.setAligned(alignment->getSeqBAln());
1013 int length = fSeq.getAligned().length();
1015 //traverse alignments merging into one contiguous seq
1017 int numMismatches = 0;
1018 string seq1 = fSeq.getAligned();
1019 string seq2 = rSeq.getAligned();
1020 vector<int> scores1, scores2;
1021 if (thisfqualfile != "") {
1022 scores1 = fQual->getQualityScores();
1023 scores2 = rQual->getQualityScores();
1024 delete fQual; delete rQual;
1027 // if (num < 5) { cout << fSeq.getStartPos() << '\t' << fSeq.getEndPos() << '\t' << rSeq.getStartPos() << '\t' << rSeq.getEndPos() << endl; }
1028 int overlapStart = fSeq.getStartPos();
1029 int seq2Start = rSeq.getStartPos();
1031 //bigger of the 2 starting positions is the location of the overlapping start
1032 if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
1033 overlapStart = seq2Start;
1034 for (int i = 0; i < overlapStart; i++) { contig += seq1[i]; }
1035 }else { //seq1 starts later so take from 0 to overlapStart from seq2
1036 for (int i = 0; i < overlapStart; i++) { contig += seq2[i]; }
1039 int seq1End = fSeq.getEndPos();
1040 int seq2End = rSeq.getEndPos();
1041 int overlapEnd = seq1End;
1042 if (seq2End < overlapEnd) { overlapEnd = seq2End; } //smallest end position is where overlapping ends
1044 int oStart = contig.length();
1045 for (int i = overlapStart; i < overlapEnd; i++) {
1046 if (seq1[i] == seq2[i]) { //match, add base and choose highest score
1048 }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below insert. In that case eliminate base
1049 if (thisfqualfile != "") {
1050 if (scores2[BBaseMap[i]] <= insert) { } //
1051 else { contig += seq2[i]; }
1052 }else { contig += seq2[i]; } //with no quality info, then we keep it?
1053 }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below insert. In that case eliminate base
1054 if (thisfqualfile != "") {
1055 if (scores1[ABaseMap[i]] <= insert) { } //
1056 else { contig += seq1[i]; }
1057 }else { contig += seq1[i]; } //with no quality info, then we keep it?
1058 }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
1059 if (thisfqualfile != "") {
1060 if (abs(scores1[ABaseMap[i]] - scores2[BBaseMap[i]]) >= deltaq) { //is the difference in qual scores >= deltaq, if yes choose base with higher score
1062 if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { c = seq2[i]; }
1064 }else { //if no, base becomes n
1068 }else { numMismatches++; } //cant decide, so eliminate and mark as mismatch
1069 }else { //should never get here
1070 m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n");
1073 int oend = contig.length();
1074 if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
1075 for (int i = overlapEnd; i < length; i++) { contig += seq2[i]; }
1076 }else { //seq2 ends before seq1 so take from overlap to length from seq1
1077 for (int i = overlapEnd; i < length; i++) { contig += seq1[i]; }
1080 if (trimOverlap) { contig = contig.substr(overlapStart-1, oend-oStart); if (contig.length() == 0) { trashCode += "l"; } }
1082 if(trashCode.length() == 0){
1083 bool ignore = false;
1085 if (m->debug) { m->mothurOut(fSeq.getName()); }
1087 if (createOligosGroup) {
1088 if(barcodes.size() != 0){
1089 string thisGroup = barcodeNameVector[barcodeIndex];
1090 if (primers.size() != 0) {
1091 if (primerNameVector[primerIndex] != "") {
1092 if(thisGroup != "") {
1093 thisGroup += "." + primerNameVector[primerIndex];
1095 thisGroup = primerNameVector[primerIndex];
1100 if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); }
1102 int pos = thisGroup.find("ignore");
1103 if (pos == string::npos) {
1104 groupMap[fSeq.getName()] = thisGroup;
1106 map<string, int>::iterator it = groupCounts.find(thisGroup);
1107 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
1108 else { groupCounts[it->first] ++; }
1109 }else { ignore = true; }
1112 }else if (createFileGroup) {
1113 int pos = group.find("ignore");
1114 if (pos == string::npos) {
1115 groupMap[fSeq.getName()] = group;
1117 map<string, int>::iterator it = groupCounts.find(group);
1118 if (it == groupCounts.end()) { groupCounts[group] = 1; }
1119 else { groupCounts[it->first] ++; }
1120 }else { ignore = true; }
1122 if (m->debug) { m->mothurOut("\n"); }
1124 if(allFiles && !ignore){
1126 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
1127 output << ">" << fSeq.getName() << endl << contig << endl;
1132 outFasta << ">" << fSeq.getName() << endl << contig << endl;
1134 for (int i = 0; i < contig.length(); i++) { if (contig[i] == 'N') { numNs++; } }
1135 outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << (oend-oStart) << '\t' << oStart << '\t' << oend << '\t' << numMismatches << '\t' << numNs << endl;
1138 outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl;
1143 if((num) % 1000 == 0){ m->mothurOut(toString(num)); m->mothurOutEndLine(); }
1147 if((num) % 1000 != 0){ m->mothurOut(toString(num)); m->mothurOutEndLine(); }
1152 outScrapFasta.close();
1153 outMisMatch.close();
1154 if (thisfqualfile != "") {
1160 if (m->control_pressed) { m->mothurRemove(outputFasta); m->mothurRemove(outputScrapFasta);m->mothurRemove(outputMisMatches); }
1164 catch(exception& e) {
1165 m->errorOut(e, "MakeContigsCommand", "driver");
1169 //**********************************************************************************************************************
1170 vector< vector<string> > MakeContigsCommand::readFastqFiles(unsigned long int& count, string ffastq, string rfastq, string findex, string rindex){
1172 vector< vector<string> > files;
1173 //maps processors number to file pointer
1174 map<int, vector<ofstream*> > tempfiles; //tempfiles[0] = forwardFasta, [1] = forwardQual, [2] = reverseFasta, [3] = reverseQual, tempfiles[4] = forwardIndex, [4] = forwardReverse
1175 map<int, vector<ofstream*> >::iterator it;
1177 //create files to write to
1178 for (int i = 0; i < processors; i++) {
1179 vector<ofstream*> temp;
1180 ofstream* outFF = new ofstream; temp.push_back(outFF);
1181 ofstream* outFQ = new ofstream; temp.push_back(outFQ);
1182 ofstream* outRF = new ofstream; temp.push_back(outRF);
1183 ofstream* outRQ = new ofstream; temp.push_back(outRQ);
1184 ofstream* outFI = new ofstream; temp.push_back(outFI);
1185 ofstream* outRI = new ofstream; temp.push_back(outRI);
1186 tempfiles[i] = temp;
1188 vector<string> names;
1189 string thisOutputDir = outputDir;
1190 if (outputDir == "") { thisOutputDir = m->hasPath(ffastq); }
1191 string ffastafilename = thisOutputDir + m->getRootName(m->getSimpleName(ffastq)) + toString(i) + "ffastatemp";
1192 string rfastafilename = thisOutputDir + m->getRootName(m->getSimpleName(rfastq)) + toString(i) + "rfastatemp";
1193 string fqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(ffastq)) + toString(i) + "fqualtemp";
1194 string rqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(rfastq)) + toString(i) + "rqualtemp";
1195 string findexfilename = ""; string rindexfilename = "";
1196 noneOk = false; //flag to oligos file read that its okay to allow for non paired barcodes
1197 if (findex != "") { findexfilename = thisOutputDir + m->getRootName(m->getSimpleName(findex)) + toString(i) + "findextemp"; m->openOutputFile(findexfilename, *outFI); noneOk = true; }
1198 if (rindex != "") { rindexfilename = thisOutputDir + m->getRootName(m->getSimpleName(rindex)) + toString(i) + "rindextemp"; m->openOutputFile(rindexfilename, *outRI); noneOk = true; }
1199 names.push_back(ffastafilename); names.push_back(fqualfilename);
1200 names.push_back(rfastafilename); names.push_back(rqualfilename);
1201 names.push_back(findexfilename); names.push_back(rindexfilename);
1202 files.push_back(names);
1204 m->openOutputFile(ffastafilename, *outFF);
1205 m->openOutputFile(rfastafilename, *outRF);
1206 m->openOutputFile(fqualfilename, *outFQ);
1207 m->openOutputFile(rqualfilename, *outRQ);
1210 if (m->control_pressed) {
1211 //close files, delete ofstreams
1212 for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } }
1214 for (int i = 0; i < files.size(); i++) {
1215 for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } }
1220 m->openInputFile(ffastq, inForward);
1223 m->openInputFile(rfastq, inReverse);
1225 ifstream infIndex, inrIndex;
1226 bool findexIsGood = false;
1227 bool rindexIsGood = false;
1228 if (findex != "") { m->openInputFile(findex, infIndex); findexIsGood = true; }
1229 if (rindex != "") { m->openInputFile(rindex, inrIndex); rindexIsGood = true; }
1232 map<string, fastqRead> uniques;
1233 map<string, fastqRead> iUniques;
1234 map<string, pairFastqRead> pairUniques;
1235 map<string, fastqRead>::iterator itUniques;
1236 while ((!inForward.eof()) || (!inReverse.eof()) || (findexIsGood) || (rindexIsGood)) {
1238 if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } } } inForward.close(); inReverse.close(); if (findex != "") { infIndex.close(); } if (findex != "") { inrIndex.close(); } return files; }
1240 //get a read from forward and reverse fastq files
1241 bool ignoref, ignorer, ignorefi, ignoreri;
1242 fastqRead thisFread, thisRread, thisFIread, thisRIread;
1243 if (!inForward.eof()) { thisFread = readFastq(inForward, ignoref); }
1244 else { ignoref = true; }
1245 if (!inReverse.eof()) { thisRread = readFastq(inReverse, ignorer); }
1246 else { ignorer = true; }
1247 if (findexIsGood) { thisFIread = readFastq(infIndex, ignorefi); if (infIndex.eof()) { findexIsGood = false; } }
1248 else { ignorefi = true; }
1249 if (rindexIsGood) { thisRIread = readFastq(inrIndex, ignoreri); if (inrIndex.eof()) { rindexIsGood = false; } }
1250 else { ignoreri = true; }
1252 bool allowOne = false;
1253 if ((findex == "") || (rindex == "")) { allowOne = true; }
1254 vector<pairFastqRead> frReads = getReads(ignoref, ignorer, thisFread, thisRread, uniques, false);
1255 vector<pairFastqRead> friReads = getReads(ignorefi, ignoreri, thisFIread, thisRIread, iUniques, allowOne);
1257 //add in index info if provided
1258 vector<pairFastqRead> reads = frReads;
1259 if ((findex != "") || (rindex != "")) { reads = mergeReads(frReads, friReads, pairUniques); }
1261 for (int i = 0; i < reads.size(); i++) {
1262 fastqRead fread = reads[i].forward;
1263 fastqRead rread = reads[i].reverse;
1264 fastqRead firead = reads[i].findex;
1265 fastqRead riread = reads[i].rindex;
1267 if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); if (findex != "") { m->mothurOut(toString(count) + '\t' + firead.name + '\n'); } if (rindex != "") { m->mothurOut(toString(count) + '\t' + riread.name + '\n'); } }
1269 //if (checkReads(fread, rread, ffastq, rfastq)) {
1270 if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } } } inForward.close(); inReverse.close(); if (findex != "") { infIndex.close(); } if (findex != "") { inrIndex.close(); } return files; }
1272 //if the reads are okay write to output files
1273 int process = count % processors;
1275 *(tempfiles[process][0]) << ">" << fread.name << endl << fread.sequence << endl;
1276 *(tempfiles[process][1]) << ">" << fread.name << endl;
1277 for (int i = 0; i < fread.scores.size(); i++) { *(tempfiles[process][1]) << fread.scores[i] << " "; }
1278 *(tempfiles[process][1]) << endl;
1279 *(tempfiles[process][2]) << ">" << rread.name << endl << rread.sequence << endl;
1280 *(tempfiles[process][3]) << ">" << rread.name << endl;
1281 for (int i = 0; i < rread.scores.size(); i++) { *(tempfiles[process][3]) << rread.scores[i] << " "; }
1282 *(tempfiles[process][3]) << endl;
1283 if (findex != "") { *(tempfiles[process][4]) << ">" << firead.name << endl << firead.sequence << endl; }
1284 if (rindex != "") { *(tempfiles[process][5]) << ">" << riread.name << endl << riread.sequence << endl; }
1289 if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
1294 if((count) % 10000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
1296 if (uniques.size() != 0) {
1297 for (itUniques = uniques.begin(); itUniques != uniques.end(); itUniques++) {
1298 m->mothurOut("[WARNING]: did not find paired read for " + itUniques->first + ", ignoring.\n");
1300 for (map<string, pairFastqRead>:: iterator it = pairUniques.begin(); it != pairUniques.end(); it++) {
1301 m->mothurOut("[WARNING]: did not find paired read for " + (it->first).substr(1) + ", ignoring.\n");
1303 m->mothurOutEndLine();
1306 //close files, delete ofstreams
1307 for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } }
1310 if (findex != "") { infIndex.close(); }
1311 if (rindex != "") { inrIndex.close(); }
1315 catch(exception& e) {
1316 m->errorOut(e, "MakeContigsCommand", "readFastqFiles");
1320 //**********************************************************************************************************************
1321 vector< vector<string> > MakeContigsCommand::readFastaFiles(unsigned long int& count, string ffasta, string rfasta){
1323 vector< vector<string> > files;
1324 //maps processors number to file pointer
1325 map<int, vector<ofstream*> > tempfiles; //tempfiles[0] = forwardFasta, [1] = forwardQual, [2] = reverseFasta, [3] = reverseQual
1326 map<int, vector<ofstream*> >::iterator it;
1328 //create files to write to
1329 for (int i = 0; i < processors; i++) {
1330 vector<ofstream*> temp;
1331 ofstream* outFF = new ofstream; temp.push_back(outFF);
1332 ofstream* outFQ = new ofstream; temp.push_back(outFQ);
1333 ofstream* outRF = new ofstream; temp.push_back(outRF);
1334 ofstream* outRQ = new ofstream; temp.push_back(outRQ);
1335 tempfiles[i] = temp;
1337 vector<string> names;
1338 string thisOutputDir = outputDir;
1339 if (outputDir == "") { thisOutputDir = m->hasPath(ffasta); }
1340 string ffastafilename = thisOutputDir + m->getRootName(m->getSimpleName(ffasta)) + toString(i) + "ffastatemp";
1341 string rfastafilename = thisOutputDir + m->getRootName(m->getSimpleName(rfasta)) + toString(i) + "rfastatemp";
1342 string fqualfilename = "";
1343 if (fqualfile != "") { fqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(fqualfile)) + toString(i) + "fqual.temp"; m->openOutputFile(fqualfilename, *outFQ); }
1344 string rqualfilename = "";
1345 if (rqualfile != "") { rqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(rqualfile)) + toString(i) + "rqual.temp"; m->openOutputFile(rqualfilename, *outRQ); }
1346 string findexfilename = ""; string rindexfilename = "";
1347 names.push_back(ffastafilename); names.push_back(fqualfilename);
1348 names.push_back(rfastafilename); names.push_back(rqualfilename);
1349 names.push_back(findexfilename); names.push_back(rindexfilename);
1350 files.push_back(names);
1352 m->openOutputFile(ffastafilename, *outFF);
1353 m->openOutputFile(rfastafilename, *outRF);
1356 if (m->control_pressed) {
1357 //close files, delete ofstreams
1358 for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } }
1360 for (int i = 0; i < files.size(); i++) {
1361 for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); }
1365 ifstream inForwardFasta;
1366 m->openInputFile(ffasta, inForwardFasta);
1368 ifstream inReverseFasta;
1369 m->openInputFile(rfasta, inReverseFasta);
1371 ifstream inForwardQual, inReverseQual;
1372 if (fqualfile != "") { m->openInputFile(fqualfile, inForwardQual); m->openInputFile(rqualfile, inReverseQual); }
1375 map<string, fastqRead> uniques;
1376 map<string, fastqRead>::iterator itUniques;
1377 while ((!inForwardFasta.eof()) || (!inReverseFasta.eof())) {
1379 if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inReverseFasta.close(); inForwardFasta.close(); if (fqualfile != "") { inReverseQual.close(); inReverseQual.close(); } return files; }
1381 //get a reads from forward and reverse fasta files
1382 bool ignoref, ignorer;
1383 fastqRead thisFread, thisRread;
1384 if (!inForwardFasta.eof()) {
1386 Sequence temp(inForwardFasta); m->gobble(inForwardFasta);
1387 thisFread.name = temp.getName();
1388 thisFread.sequence = temp.getUnaligned();
1389 }else { ignoref = true; }
1390 if (!inReverseFasta.eof()) {
1392 Sequence temp(inReverseFasta); m->gobble(inReverseFasta);
1393 thisRread.name = temp.getName();
1394 thisRread.sequence = temp.getUnaligned();
1395 }else { ignorer = true; }
1397 //get qual reads if given
1398 if (fqualfile != "") {
1399 if (!inForwardQual.eof() && !ignoref) {
1400 QualityScores temp(inForwardQual); m->gobble(inForwardQual);
1401 //if forward files dont match ignore read
1402 if (thisFread.name != temp.getName()) { ignoref = true; }
1403 else { thisFread.scores = temp.getQualityScores(); }
1404 }else { ignoref = true; }
1405 if (!inReverseQual.eof() && !ignorer) {
1406 QualityScores temp(inReverseQual); m->gobble(inReverseQual);
1407 //if reverse files dont match ignore read
1408 if (thisRread.name != temp.getName()) { ignorer = true; }
1409 else { thisRread.scores = temp.getQualityScores(); }
1410 }else { ignorer = true; }
1413 vector<pairFastqRead> reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques, false);
1415 for (int i = 0; i < reads.size(); i++) {
1416 fastqRead fread = reads[i].forward;
1417 fastqRead rread = reads[i].reverse;
1419 if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); }
1421 // if (checkReads(fread, rread, ffasta, rfasta)) {
1422 if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inReverseFasta.close(); inForwardFasta.close(); if (fqualfile != "") { inReverseQual.close(); inReverseQual.close(); } return files; }
1424 //if the reads are okay write to output files
1425 int process = count % processors;
1427 *(tempfiles[process][0]) << ">" << fread.name << endl << fread.sequence << endl;
1428 *(tempfiles[process][2]) << ">" << rread.name << endl << rread.sequence << endl;
1429 if (fqualfile != "") { //if you have quality info, print it
1430 *(tempfiles[process][1]) << ">" << fread.name << endl;
1431 for (int i = 0; i < fread.scores.size(); i++) { *(tempfiles[process][1]) << fread.scores[i] << " "; }
1432 *(tempfiles[process][1]) << endl;
1433 *(tempfiles[process][3]) << ">" << rread.name << endl;
1434 for (int i = 0; i < rread.scores.size(); i++) { *(tempfiles[process][3]) << rread.scores[i] << " "; }
1435 *(tempfiles[process][3]) << endl;
1440 if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
1445 if((count) % 10000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
1447 if (uniques.size() != 0) {
1448 for (itUniques = uniques.begin(); itUniques != uniques.end(); itUniques++) {
1449 m->mothurOut("[WARNING]: did not find paired read for " + itUniques->first + ", ignoring.\n");
1451 m->mothurOutEndLine();
1454 //close files, delete ofstreams
1455 for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } }
1456 inReverseFasta.close();
1457 inForwardFasta.close();
1458 if (fqualfile != "") { inReverseQual.close(); inReverseQual.close(); }
1462 catch(exception& e) {
1463 m->errorOut(e, "MakeContigsCommand", "readFastaFiles");
1467 //**********************************************************************************************************************
1468 vector<pairFastqRead> MakeContigsCommand::getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques, bool allowOne){
1470 vector<pairFastqRead> reads;
1471 map<string, fastqRead>::iterator itUniques;
1473 if (!ignoref && !ignorer) {
1474 if (forward.name == reverse.name) {
1475 pairFastqRead temp(forward, reverse);
1476 reads.push_back(temp);
1479 //if no match are the names only different by 1 and 2?
1480 string tempFRead = forward.name.substr(0, forward.name.length()-1);
1481 string tempRRead = reverse.name.substr(0, reverse.name.length()-1);
1482 if (tempFRead == tempRRead) {
1483 if ((forward.name[forward.name.length()-1] == '1') && (reverse.name[reverse.name.length()-1] == '2')) {
1484 forward.name = tempFRead;
1485 reverse.name = tempRRead;
1486 pairFastqRead temp(forward, reverse);
1487 reads.push_back(temp);
1493 //look for forward pair
1494 itUniques = uniques.find(forward.name);
1495 if (itUniques != uniques.end()) { //we have the pair for this read
1496 pairFastqRead temp(forward, itUniques->second);
1497 reads.push_back(temp);
1498 uniques.erase(itUniques);
1499 }else { //save this read for later
1500 uniques[forward.name] = forward;
1503 //look for reverse pair
1504 itUniques = uniques.find(reverse.name);
1505 if (itUniques != uniques.end()) { //we have the pair for this read
1506 pairFastqRead temp(itUniques->second, reverse);
1507 reads.push_back(temp);
1508 uniques.erase(itUniques);
1509 }else { //save this read for later
1510 uniques[reverse.name] = reverse;
1515 }else if (!ignoref && ignorer) { //ignore reverse keep forward
1518 pairFastqRead temp(forward, dummy);
1519 reads.push_back(temp);
1521 //look for forward pair
1522 itUniques = uniques.find(forward.name);
1523 if (itUniques != uniques.end()) { //we have the pair for this read
1524 pairFastqRead temp(forward, itUniques->second);
1525 reads.push_back(temp);
1526 uniques.erase(itUniques);
1527 }else { //save this read for later
1528 uniques[forward.name] = forward;
1531 }else if (ignoref && !ignorer) { //ignore forward keep reverse
1534 pairFastqRead temp(dummy, reverse);
1535 reads.push_back(temp);
1537 //look for reverse pair
1538 itUniques = uniques.find(reverse.name);
1539 if (itUniques != uniques.end()) { //we have the pair for this read
1540 pairFastqRead temp(itUniques->second, reverse);
1541 reads.push_back(temp);
1542 uniques.erase(itUniques);
1543 }else { //save this read for later
1544 uniques[reverse.name] = reverse;
1547 }//else ignore both and do nothing
1551 catch(exception& e) {
1552 m->errorOut(e, "MakeContigsCommand", "readFastqFiles");
1556 //**********************************************************************************************************************
1557 //look through the reads from the forward and reverse files and try to find matching reads from index files.
1558 vector<pairFastqRead> MakeContigsCommand::mergeReads(vector<pairFastqRead> thisReads, vector<pairFastqRead> indexes, map<string, pairFastqRead>& uniques){
1560 vector<pairFastqRead> reads;
1561 map<string, pairFastqRead>::iterator itUniques;
1563 set<int> foundIndexes;
1564 for (int i = 0; i < thisReads.size(); i++) {
1566 for (int j = 0; j < indexes.size(); j++) {
1568 //incase only one index
1569 string indexName = indexes[j].forward.name;
1570 if (indexName == "") { indexName = indexes[j].reverse.name; }
1572 if (thisReads[i].forward.name == indexName){
1573 thisReads[i].findex = indexes[j].forward;
1574 thisReads[i].rindex = indexes[j].reverse;
1575 reads.push_back(thisReads[i]);
1577 foundIndexes.insert(j);
1582 //look for forward pair
1583 itUniques = uniques.find('i'+thisReads[i].forward.name);
1584 if (itUniques != uniques.end()) { //we have the pair for this read
1585 thisReads[i].findex = itUniques->second.forward;
1586 thisReads[i].rindex = itUniques->second.reverse;
1587 reads.push_back(thisReads[i]);
1588 uniques.erase(itUniques);
1589 }else { //save this read for later
1590 uniques['r'+thisReads[i].forward.name] = thisReads[i];
1595 if (foundIndexes.size() != indexes.size()) { //if we didnt match all the indexes look for them in uniques
1596 for (int j = 0; j < indexes.size(); j++) {
1597 if (foundIndexes.count(j) == 0) { //we didnt find this one
1598 //incase only one index
1599 string indexName = indexes[j].forward.name;
1600 if (indexName == "") { indexName = indexes[j].reverse.name; }
1602 //look for forward pair
1603 itUniques = uniques.find('r'+indexName);
1604 if (itUniques != uniques.end()) { //we have the pair for this read
1605 pairFastqRead temp(itUniques->second.forward, itUniques->second.reverse, indexes[j].forward, indexes[j].reverse);
1606 reads.push_back(temp);
1607 uniques.erase(itUniques);
1608 }else { //save this read for later
1609 uniques['i'+indexName] = indexes[j];
1618 catch(exception& e) {
1619 m->errorOut(e, "MakeContigsCommand", "mergeReads");
1623 //**********************************************************************************************************************
1624 fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){
1630 //read sequence name
1631 string line = m->getline(in); m->gobble(in);
1632 vector<string> pieces = m->splitWhiteSpace(line);
1633 string name = ""; if (pieces.size() != 0) { name = pieces[0]; }
1634 if (name == "") { m->mothurOut("[WARNING]: Blank fasta name, ignoring read."); m->mothurOutEndLine(); ignore=true; }
1635 else if (name[0] != '@') { m->mothurOut("[WARNING]: reading " + name + " expected a name with @ as a leading character, ignoring read."); m->mothurOutEndLine(); ignore=true; }
1636 else { name = name.substr(1); }
1639 string sequence = m->getline(in); m->gobble(in);
1640 if (sequence == "") { m->mothurOut("[WARNING]: missing sequence for " + name + ", ignoring."); ignore=true; }
1642 //read sequence name
1643 line = m->getline(in); m->gobble(in);
1644 pieces = m->splitWhiteSpace(line);
1645 string name2 = ""; if (pieces.size() != 0) { name2 = pieces[0]; }
1646 if (name2 == "") { m->mothurOut("[WARNING]: expected a name with + as a leading character, ignoring."); ignore=true; }
1647 else if (name2[0] != '+') { m->mothurOut("[WARNING]: reading " + name2 + " expected a name with + as a leading character, ignoring."); ignore=true; }
1648 else { name2 = name2.substr(1); if (name2 == "") { name2 = name; } }
1650 //read quality scores
1651 string quality = m->getline(in); m->gobble(in);
1652 if (quality == "") { m->mothurOut("[WARNING]: missing quality for " + name2 + ", ignoring."); ignore=true; }
1654 //sanity check sequence length and number of quality scores match
1655 if (name2 != "") { if (name != name2) { m->mothurOut("[WARNING]: names do not match. read " + name + " for fasta and " + name2 + " for quality, ignoring."); ignore=true; } }
1656 if (quality.length() != sequence.length()) { m->mothurOut("[WARNING]: Lengths do not match for sequence " + name + ". Read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores, ignoring read."); ignore=true; }
1658 vector<int> qualScores = convertQual(quality);
1661 read.sequence = sequence;
1662 read.scores = qualScores;
1664 if (m->debug) { m->mothurOut("[DEBUG]: " + read.name + " " + read.sequence + " " + quality + "\n"); }
1668 catch(exception& e) {
1669 m->errorOut(e, "MakeContigsCommand", "readFastq");
1673 /**********************************************************************************************************************
1674 bool MakeContigsCommand::checkReads(fastqRead& forward, fastqRead& reverse, string ffile, string rfile){
1678 //do sequence lengths match
1679 if (forward.sequence.length() != reverse.sequence.length()) {
1680 m->mothurOut("[WARNING]: For sequence " + forward.name + " I read a sequence of length " + toString(forward.sequence.length()) + " from " + ffile + ", but read a sequence of length " + toString(reverse.sequence.length()) + " from " + rfile + ", ignoring.\n");
1684 //do number of qual scores match
1685 if (forward.scores.size() != reverse.scores.size()) {
1686 m->mothurOut("[WARNING]: For sequence " + forward.name + " I read " + toString(forward.scores.size()) + " quality scores from " + ffile + ", but read " + toString(reverse.scores.size()) + " quality scores from " + rfile + ", ignoring.\n");
1692 catch(exception& e) {
1693 m->errorOut(e, "MakeContigsCommand", "checkReads");
1697 //***************************************************************************************************************
1698 //lines can be 2, 3, or 4 columns
1699 // forward.fastq reverse.fastq -> 2 column
1700 // groupName forward.fastq reverse.fastq -> 3 column
1701 // forward.fastq reverse.fastq forward.index.fastq reverse.index.fastq -> 4 column
1702 // forward.fastq reverse.fastq none reverse.index.fastq -> 4 column
1703 // forward.fastq reverse.fastq forward.index.fastq none -> 4 column
1704 vector< vector<string> > MakeContigsCommand::readFileNames(string filename){
1706 vector< vector<string> > files;
1707 string forward, reverse, findex, rindex;
1710 m->openInputFile(filename, in);
1714 if (m->control_pressed) { return files; }
1716 string line = m->getline(in); m->gobble(in);
1717 vector<string> pieces = m->splitWhiteSpace(line);
1720 if (pieces.size() == 2) {
1721 forward = pieces[0];
1722 reverse = pieces[1];
1726 }else if (pieces.size() == 3) {
1728 forward = pieces[1];
1729 reverse = pieces[2];
1732 createFileGroup = true;
1733 }else if (pieces.size() == 4) {
1734 forward = pieces[0];
1735 reverse = pieces[1];
1738 if ((findex == "none") || (findex == "NONE")){ findex = ""; }
1739 if ((rindex == "none") || (rindex == "NONE")){ rindex = ""; }
1741 m->mothurOut("[ERROR]: file lines can be 2, 3, or 4 columns. The forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file, or forward fastq file then reverse fastq then forward index and reverse index file. If you only have one index file add 'none' for the other one. \n"); m->control_pressed = true;
1744 if (m->debug) { m->mothurOut("[DEBUG]: group = " + group + ", forward = " + forward + ", reverse = " + reverse + ", forwardIndex = " + findex + ", reverseIndex = " + rindex + ".\n"); }
1746 //check to make sure both are able to be opened
1748 int openForward = m->openInputFile(forward, in2, "noerror");
1750 //if you can't open it, try default location
1751 if (openForward == 1) {
1752 if (m->getDefaultPath() != "") { //default path is set
1753 string tryPath = m->getDefaultPath() + m->getSimpleName(forward);
1754 m->mothurOut("Unable to open " + forward + ". Trying default " + tryPath); m->mothurOutEndLine();
1756 openForward = m->openInputFile(tryPath, in3, "noerror");
1762 //if you can't open it, try output location
1763 if (openForward == 1) {
1764 if (m->getOutputDir() != "") { //default path is set
1765 string tryPath = m->getOutputDir() + m->getSimpleName(forward);
1766 m->mothurOut("Unable to open " + forward + ". Trying output directory " + tryPath); m->mothurOutEndLine();
1768 openForward = m->openInputFile(tryPath, in4, "noerror");
1774 if (openForward == 1) { //can't find it
1775 m->mothurOut("[WARNING]: can't find " + forward + ", ignoring pair.\n");
1776 }else{ in2.close(); }
1779 int openReverse = m->openInputFile(reverse, in3, "noerror");
1781 //if you can't open it, try default location
1782 if (openReverse == 1) {
1783 if (m->getDefaultPath() != "") { //default path is set
1784 string tryPath = m->getDefaultPath() + m->getSimpleName(reverse);
1785 m->mothurOut("Unable to open " + reverse + ". Trying default " + tryPath); m->mothurOutEndLine();
1787 openReverse = m->openInputFile(tryPath, in3, "noerror");
1793 //if you can't open it, try output location
1794 if (openReverse == 1) {
1795 if (m->getOutputDir() != "") { //default path is set
1796 string tryPath = m->getOutputDir() + m->getSimpleName(reverse);
1797 m->mothurOut("Unable to open " + reverse + ". Trying output directory " + tryPath); m->mothurOutEndLine();
1799 openReverse = m->openInputFile(tryPath, in4, "noerror");
1805 if (openReverse == 1) { //can't find it
1806 m->mothurOut("[WARNING]: can't find " + reverse + ", ignoring pair.\n");
1807 }else{ in3.close(); }
1812 openFindex = m->openInputFile(findex, in4, "noerror"); in4.close();
1814 //if you can't open it, try default location
1815 if (openFindex == 1) {
1816 if (m->getDefaultPath() != "") { //default path is set
1817 string tryPath = m->getDefaultPath() + m->getSimpleName(findex);
1818 m->mothurOut("Unable to open " + findex + ". Trying default " + tryPath); m->mothurOutEndLine();
1820 openFindex = m->openInputFile(tryPath, in5, "noerror");
1826 //if you can't open it, try output location
1827 if (openFindex == 1) {
1828 if (m->getOutputDir() != "") { //default path is set
1829 string tryPath = m->getOutputDir() + m->getSimpleName(findex);
1830 m->mothurOut("Unable to open " + findex + ". Trying output directory " + tryPath); m->mothurOutEndLine();
1832 openFindex = m->openInputFile(tryPath, in6, "noerror");
1838 if (openFindex == 1) { //can't find it
1839 m->mothurOut("[WARNING]: can't find " + findex + ", ignoring pair.\n");
1846 openRindex = m->openInputFile(rindex, in7, "noerror"); in7.close();
1848 //if you can't open it, try default location
1849 if (openRindex == 1) {
1850 if (m->getDefaultPath() != "") { //default path is set
1851 string tryPath = m->getDefaultPath() + m->getSimpleName(rindex);
1852 m->mothurOut("Unable to open " + rindex + ". Trying default " + tryPath); m->mothurOutEndLine();
1854 openRindex = m->openInputFile(tryPath, in8, "noerror");
1860 //if you can't open it, try output location
1861 if (openRindex == 1) {
1862 if (m->getOutputDir() != "") { //default path is set
1863 string tryPath = m->getOutputDir() + m->getSimpleName(rindex);
1864 m->mothurOut("Unable to open " + rindex + ". Trying output directory " + tryPath); m->mothurOutEndLine();
1866 openRindex = m->openInputFile(tryPath, in9, "noerror");
1872 if (openRindex == 1) { //can't find it
1873 m->mothurOut("[WARNING]: can't find " + rindex + ", ignoring pair.\n");
1878 if ((openForward != 1) && (openReverse != 1) && (openFindex != 1) && (openRindex != 1)) { //good pair
1879 file2Group[files.size()] = group;
1880 vector<string> pair;
1881 pair.push_back(forward);
1882 pair.push_back(reverse);
1883 pair.push_back(findex);
1884 pair.push_back(rindex);
1885 if (((findex != "") || (rindex != "")) && (oligosfile == "")) { m->mothurOut("[ERROR]: You need to provide an oligos file if you are going to use an index file.\n"); m->control_pressed = true; }
1886 files.push_back(pair);
1893 catch(exception& e) {
1894 m->errorOut(e, "MakeContigsCommand", "readFileNames");
1898 //***************************************************************************************************************
1899 //illumina data requires paired forward and reverse data
1900 //BARCODE atgcatgc atgcatgc groupName
1901 //PRIMER atgcatgc atgcatgc groupName
1902 //PRIMER atgcatgc atgcatgc
1903 bool MakeContigsCommand::getOligos(vector<vector<string> >& fastaFileNames, string rootname){
1906 m->openInputFile(oligosfile, in);
1910 string type, foligo, roligo, group;
1912 int indexPrimer = 0;
1913 int indexBarcode = 0;
1914 set<string> uniquePrimers;
1915 set<string> uniqueBarcodes;
1921 if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }
1924 while (!in.eof()) { char c = in.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1929 //make type case insensitive
1930 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1934 if (m->debug) { m->mothurOut("[DEBUG]: reading - " + foligo + ".\n"); }
1936 for(int i=0;i<foligo.length();i++){
1937 foligo[i] = toupper(foligo[i]);
1938 if(foligo[i] == 'U') { foligo[i] = 'T'; }
1941 if(type == "PRIMER"){
1946 for(int i=0;i<roligo.length();i++){
1947 roligo[i] = toupper(roligo[i]);
1948 if(roligo[i] == 'U') { roligo[i] = 'T'; }
1950 //roligo = reverseOligo(roligo);
1952 if (m->debug) { m->mothurOut("[DEBUG]: reading - " + roligo + ".\n"); }
1956 // get rest of line in case there is a primer name
1959 if (c == 10 || c == 13 || c == -1){ break; }
1960 else if (c == 32 || c == 9){;} //space or tab
1961 else { group += c; }
1964 oligosPair newPrimer(foligo, roligo);
1966 if (m->debug) { m->mothurOut("[DEBUG]: primer pair " + newPrimer.forward + " " + newPrimer.reverse + ", and group = " + group + ".\n"); }
1968 //check for repeat barcodes
1969 string tempPair = foligo+roligo;
1970 if (uniquePrimers.count(tempPair) != 0) { m->mothurOut("primer pair " + newPrimer.forward + " " + newPrimer.reverse + " is in your oligos file already."); m->mothurOutEndLine(); }
1971 else { uniquePrimers.insert(tempPair); }
1973 if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer pair " + newPrimer.forward + " " + newPrimer.reverse + ".\n"); } }
1975 primers[indexPrimer]=newPrimer; indexPrimer++;
1976 primerNameVector.push_back(group);
1977 }else if(type == "BARCODE"){
1982 for(int i=0;i<roligo.length();i++){
1983 roligo[i] = toupper(roligo[i]);
1984 if(roligo[i] == 'U') { roligo[i] = 'T'; }
1986 //roligo = reverseOligo(roligo);
1988 oligosPair newPair(foligo, roligo);
1990 if ((foligo == "NONE") || (roligo == "NONE")) { if (!noneOk) { m->mothurOut("[ERROR]: barcodes must be paired unless you are using an index file.\n"); m->control_pressed = true; } }
1995 if (c == 10 || c == 13 || c == -1){ break; }
1996 else if (c == 32 || c == 9){;} //space or tab
1997 else { group += c; }
2000 if (m->debug) { m->mothurOut("[DEBUG]: barcode pair " + newPair.forward + " " + newPair.reverse + ", and group = " + group + ".\n"); }
2002 //check for repeat barcodes
2003 string tempPair = foligo+roligo;
2004 if (uniqueBarcodes.count(tempPair) != 0) { m->mothurOut("barcode pair " + newPair.forward + " " + newPair.reverse + " is in your oligos file already, disregarding."); m->mothurOutEndLine(); }
2005 else { uniqueBarcodes.insert(tempPair); }
2007 barcodes[indexBarcode]=newPair; indexBarcode++;
2008 barcodeNameVector.push_back(group);
2009 }else if(type == "LINKER"){
2010 linker.push_back(foligo);
2011 m->mothurOut("[WARNING]: make.contigs is not setup to remove linkers, ignoring.\n");
2012 }else if(type == "SPACER"){
2013 spacer.push_back(foligo);
2014 m->mothurOut("[WARNING]: make.contigs is not setup to remove spacers, ignoring.\n");
2016 else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are primer, barcode, linker and spacer. Ignoring " + foligo + "."); m->mothurOutEndLine(); }
2022 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
2024 //add in potential combos
2025 if(barcodeNameVector.size() == 0){
2026 oligosPair temp("", "");
2028 barcodeNameVector.push_back("");
2031 if(primerNameVector.size() == 0){
2032 oligosPair temp("", "");
2034 primerNameVector.push_back("");
2037 fastaFileNames.resize(barcodeNameVector.size());
2038 for(int i=0;i<fastaFileNames.size();i++){
2039 fastaFileNames[i].assign(primerNameVector.size(), "");
2043 set<string> uniqueNames; //used to cleanup outputFileNames
2044 for(map<int, oligosPair>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
2045 for(map<int, oligosPair>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
2047 string primerName = primerNameVector[itPrimer->first];
2048 string barcodeName = barcodeNameVector[itBar->first];
2050 if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing
2052 string comboGroupName = "";
2053 string fastaFileName = "";
2054 string qualFileName = "";
2055 string nameFileName = "";
2056 string countFileName = "";
2058 if(primerName == ""){
2059 comboGroupName = barcodeNameVector[itBar->first];
2062 if(barcodeName == ""){
2063 comboGroupName = primerNameVector[itPrimer->first];
2066 comboGroupName = barcodeNameVector[itBar->first] + "." + primerNameVector[itPrimer->first];
2072 fastaFileName = rootname + comboGroupName + ".fasta";
2073 if (uniqueNames.count(fastaFileName) == 0) {
2074 outputNames.push_back(fastaFileName);
2075 outputTypes["fasta"].push_back(fastaFileName);
2076 uniqueNames.insert(fastaFileName);
2079 fastaFileNames[itBar->first][itPrimer->first] = fastaFileName;
2080 m->openOutputFile(fastaFileName, temp); temp.close();
2086 bool allBlank = true;
2087 for (int i = 0; i < barcodeNameVector.size(); i++) {
2088 if (barcodeNameVector[i] != "") {
2093 for (int i = 0; i < primerNameVector.size(); i++) {
2094 if (primerNameVector[i] != "") {
2101 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
2109 catch(exception& e) {
2110 m->errorOut(e, "MakeContigsCommand", "getOligos");
2114 //********************************************************************/
2115 string MakeContigsCommand::reverseOligo(string oligo){
2117 string reverse = "";
2119 for(int i=oligo.length()-1;i>=0;i--){
2121 if(oligo[i] == 'A') { reverse += 'T'; }
2122 else if(oligo[i] == 'T'){ reverse += 'A'; }
2123 else if(oligo[i] == 'U'){ reverse += 'A'; }
2125 else if(oligo[i] == 'G'){ reverse += 'C'; }
2126 else if(oligo[i] == 'C'){ reverse += 'G'; }
2128 else if(oligo[i] == 'R'){ reverse += 'Y'; }
2129 else if(oligo[i] == 'Y'){ reverse += 'R'; }
2131 else if(oligo[i] == 'M'){ reverse += 'K'; }
2132 else if(oligo[i] == 'K'){ reverse += 'M'; }
2134 else if(oligo[i] == 'W'){ reverse += 'W'; }
2135 else if(oligo[i] == 'S'){ reverse += 'S'; }
2137 else if(oligo[i] == 'B'){ reverse += 'V'; }
2138 else if(oligo[i] == 'V'){ reverse += 'B'; }
2140 else if(oligo[i] == 'D'){ reverse += 'H'; }
2141 else if(oligo[i] == 'H'){ reverse += 'D'; }
2143 else { reverse += 'N'; }
2149 catch(exception& e) {
2150 m->errorOut(e, "MakeContigsCommand", "reverseOligo");
2154 //**********************************************************************************************************************
2155 vector<int> MakeContigsCommand::convertQual(string qual) {
2157 vector<int> qualScores;
2158 bool negativeScores = false;
2160 for (int i = 0; i < qual.length(); i++) {
2163 temp = int(qual[i]);
2164 if (format == "illumina") {
2165 temp -= 64; //char '@'
2166 }else if (format == "illumina1.8+") {
2167 temp -= int('!'); //char '!'
2168 }else if (format == "solexa") {
2169 temp = int(convertTable[temp]); //convert to sanger
2170 temp -= int('!'); //char '!'
2172 temp -= int('!'); //char '!'
2175 if (temp < -5) { negativeScores = true; }
2176 qualScores.push_back(temp);
2179 if (negativeScores) { m->mothurOut("[ERROR]: finding negative quality scores, do you have the right format selected? http://en.wikipedia.org/wiki/FASTQ_format#Encoding \n"); m->control_pressed = true; }
2183 catch(exception& e) {
2184 m->errorOut(e, "MakeContigsCommand", "convertQual");
2189 //**********************************************************************************************************************