2 // makecontigscommand.cpp
5 // Created by Sarah Westcott on 5/15/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "makecontigscommand.h"
11 //**********************************************************************************************************************
12 vector<string> MakeContigsCommand::setParameters(){
14 CommandParameter pfastq("ffastq", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "fastqGroup","fasta-qfile",false,false,true); parameters.push_back(pfastq);
15 CommandParameter prfastq("rfastq", "InputTypes", "", "", "none", "none", "fastqGroup","fasta-qfile",false,false,true); parameters.push_back(prfastq);
16 CommandParameter pfasta("ffasta", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "fastaGroup","fasta",false,false,true); parameters.push_back(pfasta);
17 CommandParameter prfasta("rfasta", "InputTypes", "", "", "none", "none", "none","fastaGroup",false,false,true); parameters.push_back(prfasta);
18 CommandParameter pfqual("fqfile", "InputTypes", "", "", "none", "none", "qfileGroup","",false,false,true); parameters.push_back(pfqual);
19 CommandParameter prqual("rqfile", "InputTypes", "", "", "none", "none", "qfileGroup","",false,false,true); parameters.push_back(prqual);
20 CommandParameter pfile("file", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "none","fasta-qfile",false,false,true); parameters.push_back(pfile);
21 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos);
22 CommandParameter pfindex("findex", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(pfindex);
23 CommandParameter prindex("rindex", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(prindex);
24 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
25 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pbdiffs);
26 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs);
27 CommandParameter preorient("checkorient", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(preorient);
28 CommandParameter palign("align", "Multiple", "needleman-gotoh", "needleman", "", "", "","",false,false); parameters.push_back(palign);
29 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pallfiles);
30 CommandParameter ptrimoverlap("trimoverlap", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptrimoverlap);
31 CommandParameter pmatch("match", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pmatch);
32 CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pmismatch);
33 CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapopen);
34 CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pgapextend);
35 CommandParameter pthreshold("insert", "Number", "", "20", "", "", "","",false,false); parameters.push_back(pthreshold);
36 CommandParameter pdeltaq("deltaq", "Number", "", "6", "", "", "","",false,false); parameters.push_back(pdeltaq);
37 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
38 CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa-illumina1.8+", "illumina1.8+", "", "", "","",false,false,true); parameters.push_back(pformat);
39 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
40 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
42 vector<string> myArray;
43 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
47 m->errorOut(e, "MakeContigsCommand", "setParameters");
51 //**********************************************************************************************************************
52 string MakeContigsCommand::getHelpString(){
54 string helpString = "";
55 helpString += "The make.contigs command reads a file, forward fastq file and a reverse fastq file or forward fasta and reverse fasta files and outputs new fasta. \n";
56 helpString += "If an oligos file is provided barcodes and primers will be trimmed, and a group file will be created.\n";
57 helpString += "If a forward index or reverse index file is provided barcodes be trimmed, and a group file will be created. The oligos parameter is required if an index file is given.\n";
58 helpString += "The make.contigs command parameters are file, ffastq, rfastq, ffasta, rfasta, fqfile, rqfile, oligos, findex, rindex, format, tdiffs, bdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, insert, deltaq, allfiles and processors.\n";
59 helpString += "The ffastq and rfastq, file, or ffasta and rfasta parameters are required.\n";
60 helpString += "The file parameter is 2, 3 or 4 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file, or forward fastq file then reverse fastq then forward index and reverse index file. If you only have one index file add 'none' for the other one. Mothur will process each pair and create a combined fasta and report file with all the sequences.\n";
61 helpString += "The ffastq and rfastq parameters are used to provide a forward fastq and reverse fastq file to process. If you provide one, you must provide the other.\n";
62 helpString += "The ffasta and rfasta parameters are used to provide a forward fasta and reverse fasta file to process. If you provide one, you must provide the other.\n";
63 helpString += "The fqfile and rqfile parameters are used to provide a forward quality and reverse quality files to process with the ffasta and rfasta parameters. If you provide one, you must provide the other.\n";
64 helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=illumina1.8+.\n";
65 helpString += "The findex and rindex parameters are used to provide a forward index and reverse index files to process. \n";
66 helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh and needleman. The default is needleman.\n";
67 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
68 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
69 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
70 //helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
71 //helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
72 helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n";
73 helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n";
74 helpString += "The checkorient parameter will check look for the reverse compliment of the barcode or primer in the sequence. If found the sequence is flipped. The default is false.\n";
75 helpString += "The deltaq parameter allows you to specify the delta allowed between quality scores of a mismatched base. For example in the overlap, if deltaq=5 and in the alignment seqA, pos 200 has a quality score of 30 and the same position in seqB has a quality score of 20, you take the base from seqA (30-20 >= 5). If the quality score in seqB is 28 then the base in the consensus will be an N (30-28<5) The default is 6.\n";
76 helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n";
77 helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n";
78 helpString += "The insert parameter allows you to set a quality scores threshold. In the case where we are trying to decide whether to keep a base or remove it because the base is compared to a gap in the other fragment, if the base has a quality score equal to or below the threshold we eliminate it. Default=20.\n";
79 helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
80 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
82 helpString += "The trimoverlap parameter allows you to trim the sequences to only the overlapping section. The default is F.\n";
83 helpString += "The make.contigs command should be in the following format: \n";
84 helpString += "make.contigs(ffastq=yourForwardFastqFile, rfastq=yourReverseFastqFile, align=yourAlignmentMethod) \n";
85 helpString += "Note: No spaces between parameter labels (i.e. ffastq), '=' and parameters (i.e.yourForwardFastqFile).\n";
89 m->errorOut(e, "MakeContigsCommand", "getHelpString");
93 //**********************************************************************************************************************
94 string MakeContigsCommand::getOutputPattern(string type) {
98 if (type == "fasta") { pattern = "[filename],[tag],contigs.fasta"; }
99 else if (type == "group") { pattern = "[filename],[tag],contigs.groups"; }
100 else if (type == "report") { pattern = "[filename],[tag],contigs.report"; }
101 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
105 catch(exception& e) {
106 m->errorOut(e, "MakeContigsCommand", "getOutputPattern");
110 //**********************************************************************************************************************
111 MakeContigsCommand::MakeContigsCommand(){
113 abort = true; calledHelp = true;
115 vector<string> tempOutNames;
116 outputTypes["fasta"] = tempOutNames;
117 outputTypes["group"] = tempOutNames;
118 outputTypes["report"] = tempOutNames;
120 catch(exception& e) {
121 m->errorOut(e, "MakeContigsCommand", "MakeContigsCommand");
125 //**********************************************************************************************************************
126 MakeContigsCommand::MakeContigsCommand(string option) {
128 abort = false; calledHelp = false;
129 createFileGroup = false; createOligosGroup = false;
131 //allow user to run help
132 if(option == "help") { help(); abort = true; calledHelp = true; }
133 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
136 vector<string> myArray = setParameters();
138 OptionParser parser(option);
139 map<string, string> parameters = parser.getParameters();
141 ValidParameters validParameter("pairwise.seqs");
142 map<string, string>::iterator it;
144 //check to make sure all parameters are valid for command
145 for (it = parameters.begin(); it != parameters.end(); it++) {
146 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
149 //initialize outputTypes
150 vector<string> tempOutNames;
151 outputTypes["fasta"] = tempOutNames;
152 outputTypes["report"] = tempOutNames;
153 outputTypes["group"] = tempOutNames;
156 //if the user changes the input directory command factory will send this info to us in the output parameter
157 inputDir = validParameter.validFile(parameters, "inputdir", false);
158 if (inputDir == "not found"){ inputDir = ""; }
161 it = parameters.find("ffastq");
162 //user has given a template file
163 if(it != parameters.end()){
164 path = m->hasPath(it->second);
165 //if the user has not given a path then, add inputdir. else leave path alone.
166 if (path == "") { parameters["ffastq"] = inputDir + it->second; }
169 it = parameters.find("rfastq");
170 //user has given a template file
171 if(it != parameters.end()){
172 path = m->hasPath(it->second);
173 //if the user has not given a path then, add inputdir. else leave path alone.
174 if (path == "") { parameters["rfastq"] = inputDir + it->second; }
177 it = parameters.find("ffasta");
178 //user has given a template file
179 if(it != parameters.end()){
180 path = m->hasPath(it->second);
181 //if the user has not given a path then, add inputdir. else leave path alone.
182 if (path == "") { parameters["ffasta"] = inputDir + it->second; }
185 it = parameters.find("rfasta");
186 //user has given a template file
187 if(it != parameters.end()){
188 path = m->hasPath(it->second);
189 //if the user has not given a path then, add inputdir. else leave path alone.
190 if (path == "") { parameters["rfasta"] = inputDir + it->second; }
193 it = parameters.find("fqfile");
194 //user has given a template file
195 if(it != parameters.end()){
196 path = m->hasPath(it->second);
197 //if the user has not given a path then, add inputdir. else leave path alone.
198 if (path == "") { parameters["fqfile"] = inputDir + it->second; }
201 it = parameters.find("rqfile");
202 //user has given a template file
203 if(it != parameters.end()){
204 path = m->hasPath(it->second);
205 //if the user has not given a path then, add inputdir. else leave path alone.
206 if (path == "") { parameters["rqfile"] = inputDir + it->second; }
209 it = parameters.find("file");
210 //user has given a template file
211 if(it != parameters.end()){
212 path = m->hasPath(it->second);
213 //if the user has not given a path then, add inputdir. else leave path alone.
214 if (path == "") { parameters["file"] = inputDir + it->second; }
217 it = parameters.find("oligos");
218 //user has given a template file
219 if(it != parameters.end()){
220 path = m->hasPath(it->second);
221 //if the user has not given a path then, add inputdir. else leave path alone.
222 if (path == "") { parameters["oligos"] = inputDir + it->second; }
225 it = parameters.find("findex");
226 //user has given a template file
227 if(it != parameters.end()){
228 path = m->hasPath(it->second);
229 //if the user has not given a path then, add inputdir. else leave path alone.
230 if (path == "") { parameters["findex"] = inputDir + it->second; }
233 it = parameters.find("rindex");
234 //user has given a template file
235 if(it != parameters.end()){
236 path = m->hasPath(it->second);
237 //if the user has not given a path then, add inputdir. else leave path alone.
238 if (path == "") { parameters["rindex"] = inputDir + it->second; }
242 ffastqfile = validParameter.validFile(parameters, "ffastq", true);
243 if (ffastqfile == "not open") { abort = true; }
244 else if (ffastqfile == "not found") { ffastqfile = ""; }
246 rfastqfile = validParameter.validFile(parameters, "rfastq", true);
247 if (rfastqfile == "not open") { abort = true; }
248 else if (rfastqfile == "not found") { rfastqfile = ""; }
250 ffastafile = validParameter.validFile(parameters, "ffasta", true);
251 if (ffastafile == "not open") { abort = true; }
252 else if (ffastafile == "not found") { ffastafile = ""; }
254 rfastafile = validParameter.validFile(parameters, "rfasta", true);
255 if (rfastafile == "not open") { abort = true; }
256 else if (rfastafile == "not found") { rfastafile = ""; }
258 fqualfile = validParameter.validFile(parameters, "fqfile", true);
259 if (fqualfile == "not open") { abort = true; }
260 else if (fqualfile == "not found") { fqualfile = ""; }
262 rqualfile = validParameter.validFile(parameters, "rqfile", true);
263 if (rqualfile == "not open") { abort = true; }
264 else if (rqualfile == "not found") { rqualfile = ""; }
266 file = validParameter.validFile(parameters, "file", true);
267 if (file == "not open") { abort = true; }
268 else if (file == "not found") { file = ""; }
271 if ((file == "") && (ffastafile == "") && (ffastqfile == "")) { abort = true; m->mothurOut("[ERROR]: The file, ffastq and rfastq or ffasta and rfasta parameters are required.\n"); }
272 if ((file != "") && ((ffastafile != "") || (ffastqfile != ""))) { abort = true; m->mothurOut("[ERROR]: The file, ffastq and rfastq or ffasta and rfasta parameters are required.\n"); }
273 if ((ffastqfile != "") && (rfastqfile == "")) { abort = true; m->mothurOut("[ERROR]: If you provide use the ffastq, you must provide a rfastq file.\n"); }
274 if ((ffastqfile == "") && (rfastqfile != "")) { abort = true; m->mothurOut("[ERROR]: If you provide use the rfastq, you must provide a ffastq file.\n"); }
275 if ((ffastafile != "") && (rfastafile == "")) { abort = true; m->mothurOut("[ERROR]: If you provide use the ffasta, you must provide a rfasta file.\n"); }
276 if ((ffastafile == "") && (rfastafile != "")) { abort = true; m->mothurOut("[ERROR]: If you provide use the rfasta, you must provide a ffasta file.\n"); }
277 if ((fqualfile != "") && (rqualfile == "")) { abort = true; m->mothurOut("[ERROR]: If you provide use the fqfile, you must provide a rqfile file.\n"); }
278 if ((fqualfile == "") && (rqualfile != "")) { abort = true; m->mothurOut("[ERROR]: If you provide use the rqfile, you must provide a fqfile file.\n"); }
279 if (((fqualfile != "") || (rqualfile != "")) && ((ffastafile == "") || (rfastafile == ""))) {
280 abort = true; m->mothurOut("[ERROR]: If you provide use the rqfile or fqfile file, you must provide the ffasta and rfasta parameters.\n");
283 oligosfile = validParameter.validFile(parameters, "oligos", true);
284 if (oligosfile == "not found") { oligosfile = ""; }
285 else if(oligosfile == "not open") { abort = true; }
286 else { m->setOligosFile(oligosfile); }
288 findexfile = validParameter.validFile(parameters, "findex", true);
289 if (findexfile == "not found") { findexfile = ""; }
290 else if(findexfile == "not open") { abort = true; }
292 rindexfile = validParameter.validFile(parameters, "rindex", true);
293 if (rindexfile == "not found") { rindexfile = ""; }
294 else if(rindexfile == "not open") { abort = true; }
296 if ((rindexfile != "") || (findexfile != "")) {
297 if (oligosfile == ""){
298 oligosfile = m->getOligosFile();
299 if (oligosfile != "") { m->mothurOut("Using " + oligosfile + " as input file for the oligos parameter.\n"); }
301 m->mothurOut("You need to provide an oligos file if you are going to use an index file.\n"); abort = true;
305 //can only use an index file with the fastq parameters not fasta and qual
306 if ((ffastafile != "") || (rfastafile != "")) {
307 m->mothurOut("[ERROR]: You can only use an index file with the fastq parameters or the file option.\n"); abort = true;
311 //if the user changes the output directory command factory will send this info to us in the output parameter
312 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
317 //check for optional parameter and set defaults
318 // ...at some point should added some additional type checking...
320 temp = validParameter.validFile(parameters, "match", false); if (temp == "not found"){ temp = "1.0"; }
321 m->mothurConvert(temp, match);
323 temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found"){ temp = "-1.0"; }
324 m->mothurConvert(temp, misMatch);
325 if (misMatch > 0) { m->mothurOut("[ERROR]: mismatch must be negative.\n"); abort=true; }
327 temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-2.0"; }
328 m->mothurConvert(temp, gapOpen);
329 if (gapOpen > 0) { m->mothurOut("[ERROR]: gapopen must be negative.\n"); abort=true; }
331 temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-1.0"; }
332 m->mothurConvert(temp, gapExtend);
333 if (gapExtend > 0) { m->mothurOut("[ERROR]: gapextend must be negative.\n"); abort=true; }
335 temp = validParameter.validFile(parameters, "insert", false); if (temp == "not found"){ temp = "20"; }
336 m->mothurConvert(temp, insert);
337 if ((insert < 0) || (insert > 40)) { m->mothurOut("[ERROR]: insert must be between 0 and 40.\n"); abort=true; }
339 temp = validParameter.validFile(parameters, "deltaq", false); if (temp == "not found"){ temp = "6"; }
340 m->mothurConvert(temp, deltaq);
342 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
343 m->setProcessors(temp);
344 m->mothurConvert(temp, processors);
346 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
347 m->mothurConvert(temp, bdiffs);
349 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
350 m->mothurConvert(temp, pdiffs);
352 // temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
353 // m->mothurConvert(temp, ldiffs);
356 // temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
357 // m->mothurConvert(temp, sdiffs);
360 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
361 m->mothurConvert(temp, tdiffs);
363 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; } //+ ldiffs + sdiffs;
365 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
366 allFiles = m->isTrue(temp);
369 temp = validParameter.validFile(parameters, "trimoverlap", false); if (temp == "not found") { temp = "F"; }
370 trimOverlap = m->isTrue(temp);
372 align = validParameter.validFile(parameters, "align", false); if (align == "not found"){ align = "needleman"; }
373 if ((align != "needleman") && (align != "gotoh")) { m->mothurOut(align + " is not a valid alignment method. Options are needleman or gotoh. I will use needleman."); m->mothurOutEndLine(); align = "needleman"; }
375 format = validParameter.validFile(parameters, "format", false); if (format == "not found"){ format = "illumina1.8+"; }
377 if ((format != "sanger") && (format != "illumina") && (format != "illumina1.8+") && (format != "solexa")) {
378 m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa, illumina1.8+ and illumina, aborting." ); m->mothurOutEndLine();
382 temp = validParameter.validFile(parameters, "checkorient", false); if (temp == "not found") { temp = "F"; }
383 reorient = m->isTrue(temp);
385 //fill convert table - goes from solexa to sanger. Used fq_all2std.pl as a reference.
386 for (int i = -64; i < 65; i++) {
387 char temp = (char) ((int)(33 + 10*log(1+pow(10,(i/10.0)))/log(10)+0.499));
388 convertTable.push_back(temp);
393 catch(exception& e) {
394 m->errorOut(e, "MakeContigsCommand", "MakeContigsCommand");
398 //**********************************************************************************************************************
399 int MakeContigsCommand::execute(){
401 if (abort == true) { if (calledHelp) { return 0; } return 2; }
403 //read ffastq and rfastq files creating fasta and qual files.
404 //this function will create a forward and reverse, fasta and qual files for each processor.
405 //files has an entry for each processor. files[i][0] = forwardFasta, files[i][1] = forwardQual, files[i][2] = reverseFasta, files[i][3] = reverseQual. filesToProcess is for each filepair in the file parameter file. for ffastq and rfastq this will be size 1.
406 unsigned long int numReads = 0;
407 int start = time(NULL);
409 m->mothurOut("Reading fastq data...\n");
410 vector < vector< vector<string> > > filesToProcess = preProcessData(numReads);
411 m->mothurOut("Done.\n");
413 if (m->control_pressed) { return 0; }
415 map<string, string> cvars;
416 string compOutputDir = outputDir;
417 if (outputDir == "") { compOutputDir = m->hasPath(file); }
418 cvars["[filename]"] = compOutputDir + m->getRootName(m->getSimpleName(file));
420 string compositeGroupFile = getOutputFileName("group",cvars);
421 cvars["[tag]"] = "trim";
422 string compositeFastaFile = getOutputFileName("fasta",cvars);
423 cvars["[tag]"] = "scrap";
424 string compositeScrapFastaFile = getOutputFileName("fasta",cvars);
426 string compositeMisMatchFile = getOutputFileName("report",cvars);
428 if (filesToProcess.size() > 1) { //clear files for append below
429 ofstream outCTFasta, outCTQual, outCSFasta, outCSQual, outCMisMatch;
430 m->openOutputFile(compositeFastaFile, outCTFasta); outCTFasta.close();
431 m->openOutputFile(compositeScrapFastaFile, outCSFasta); outCSFasta.close();
432 m->openOutputFile(compositeMisMatchFile, outCMisMatch); outCMisMatch.close();
433 outputNames.push_back(compositeFastaFile); outputTypes["fasta"].push_back(compositeFastaFile);
434 outputNames.push_back(compositeMisMatchFile); outputTypes["report"].push_back(compositeMisMatchFile);
435 outputNames.push_back(compositeScrapFastaFile); outputTypes["fasta"].push_back(compositeScrapFastaFile);
438 map<string, int> totalGroupCounts;
440 for (int l = 0; l < filesToProcess.size(); l++) {
442 m->mothurOut("\n>>>>>\tProcessing " + filesToProcess[l][0][0] + " (file " + toString(l+1) + " of " + toString(filesToProcess.size()) + ")\t<<<<<\n");
446 vector<vector<string> > fastaFileNames;
447 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
448 createOligosGroup = false;
449 oligos = new Oligos();
450 numBarcodes = 0; numFPrimers= 0; numLinkers= 0; numSpacers = 0; numRPrimers = 0;
451 string outputGroupFileName;
452 map<string, string> variables;
453 string thisOutputDir = outputDir;
454 if (outputDir == "") { thisOutputDir = m->hasPath(filesToProcess[l][0][0]); }
455 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(filesToProcess[l][0][0]));
456 variables["[tag]"] = "";
457 if(oligosfile != ""){ createOligosGroup = getOligos(fastaFileNames, variables["[filename]"], uniqueFastaNames); }
458 if (createOligosGroup || createFileGroup) {
459 outputGroupFileName = getOutputFileName("group",variables);
462 //give group in file file precedence
463 if (createFileGroup) { createOligosGroup = false; }
465 variables["[tag]"] = "trim";
466 string outFastaFile = getOutputFileName("fasta",variables);
467 variables["[tag]"] = "scrap";
468 string outScrapFastaFile = getOutputFileName("fasta",variables);
469 variables["[tag]"] = "";
470 string outMisMatchFile = getOutputFileName("report",variables);
472 m->mothurOut("Making contigs...\n");
473 createProcesses(filesToProcess[l], outFastaFile, outScrapFastaFile, outMisMatchFile, fastaFileNames, l);
475 //remove temp fasta and qual files
476 for (int i = 0; i < processors; i++) { for(int j = 0; j < filesToProcess[l][i].size(); j++) { m->mothurRemove(filesToProcess[l][i][j]); } }
478 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete oligos; return 0; }
481 // so we don't add the same groupfile multiple times
482 map<string, string>::iterator it;
483 set<string> namesToRemove;
484 for(int i=0;i<fastaFileNames.size();i++){
485 for(int j=0;j<fastaFileNames[0].size();j++){
486 if (fastaFileNames[i][j] != "") {
487 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
488 if(m->isBlank(fastaFileNames[i][j])){
489 m->mothurRemove(fastaFileNames[i][j]);
490 namesToRemove.insert(fastaFileNames[i][j]);
491 uniqueFastaNames.erase(fastaFileNames[i][j]); //remove from list for group file print
498 //remove names for outputFileNames, just cleans up the output
499 vector<string> outputNames2;
500 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
501 outputNames = outputNames2;
503 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
505 m->openInputFile(it->first, in);
508 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(it->first));
509 string thisGroupName = getOutputFileName("group",variables); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
510 m->openOutputFile(thisGroupName, out);
513 if (m->control_pressed) { break; }
515 Sequence currSeq(in); m->gobble(in);
516 out << currSeq.getName() << '\t' << it->second << endl;
523 if (createFileGroup || createOligosGroup) {
525 m->openOutputFile(outputGroupFileName, outGroup);
526 for (map<string, string>::iterator itGroup = groupMap.begin(); itGroup != groupMap.end(); itGroup++) {
527 outGroup << itGroup->first << '\t' << itGroup->second << endl;
532 if (filesToProcess.size() > 1) { //merge into large combo files
533 if (createFileGroup || createOligosGroup) {
536 m->openOutputFile(compositeGroupFile, outCGroup); outCGroup.close();
537 outputNames.push_back(compositeGroupFile); outputTypes["group"].push_back(compositeGroupFile);
539 m->appendFiles(outputGroupFileName, compositeGroupFile);
540 if (!allFiles) { m->mothurRemove(outputGroupFileName); }
541 else { outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); }
543 for (map<string, int>::iterator itGroups = groupCounts.begin(); itGroups != groupCounts.end(); itGroups++) {
544 map<string, int>::iterator itTemp = totalGroupCounts.find(itGroups->first);
545 if (itTemp == totalGroupCounts.end()) { totalGroupCounts[itGroups->first] = itGroups->second; } //new group create it in totalGroups
546 else { itTemp->second += itGroups->second; } //existing group, update total
549 if (l == 0) { m->appendFiles(outMisMatchFile, compositeMisMatchFile); }
550 else { m->appendFilesWithoutHeaders(outMisMatchFile, compositeMisMatchFile); }
551 m->appendFiles(outFastaFile, compositeFastaFile);
552 m->appendFiles(outScrapFastaFile, compositeScrapFastaFile);
554 m->mothurRemove(outMisMatchFile);
555 m->mothurRemove(outFastaFile);
556 m->mothurRemove(outScrapFastaFile);
558 outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile);
559 outputNames.push_back(outScrapFastaFile); outputTypes["fasta"].push_back(outScrapFastaFile);
560 outputNames.push_back(outMisMatchFile); outputTypes["report"].push_back(outMisMatchFile);
563 totalGroupCounts = groupCounts;
564 outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile);
565 outputNames.push_back(outScrapFastaFile); outputTypes["fasta"].push_back(outScrapFastaFile);
566 outputNames.push_back(outMisMatchFile); outputTypes["report"].push_back(outMisMatchFile);
567 if (createFileGroup || createOligosGroup) {
568 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
571 m->mothurOut("Done.\n");
575 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process " + toString(numReads) + " sequences.\n");
577 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
579 //output group counts
580 m->mothurOutEndLine();
582 if (totalGroupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
583 for (map<string, int>::iterator it = totalGroupCounts.begin(); it != totalGroupCounts.end(); it++) {
584 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
586 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
588 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
590 string currentFasta = "";
591 itTypes = outputTypes.find("fasta");
592 if (itTypes != outputTypes.end()) {
593 if ((itTypes->second).size() != 0) { currentFasta = (itTypes->second)[0]; m->setFastaFile(currentFasta); }
596 string currentGroup = "";
597 itTypes = outputTypes.find("group");
598 if (itTypes != outputTypes.end()) {
599 if ((itTypes->second).size() != 0) { currentGroup = (itTypes->second)[0]; m->setGroupFile(currentGroup); }
602 //output files created by command
603 m->mothurOutEndLine();
604 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
605 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
606 m->mothurOutEndLine();
610 catch(exception& e) {
611 m->errorOut(e, "MakeContigsCommand", "execute");
615 //**********************************************************************************************************************
616 vector< vector< vector<string> > > MakeContigsCommand::preProcessData(unsigned long int& numReads) {
618 vector< vector< vector<string> > > filesToProcess;
620 if (ffastqfile != "") { //reading one file
621 vector< vector<string> > files = readFastqFiles(numReads, ffastqfile, rfastqfile, findexfile, rindexfile);
622 //adjust for really large processors or really small files
623 if (numReads == 0) { m->mothurOut("[ERROR]: no good reads.\n"); m->control_pressed = true; }
624 if (numReads < processors) {
625 for (int i = numReads; i < processors; i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } files[i].clear(); }
626 files.resize(numReads);
627 processors = numReads;
629 filesToProcess.push_back(files);
630 }else if (file != "") { //reading multiple files
631 //return only valid pairs
632 vector< vector<string> > filePairsToProcess = readFileNames(file);
634 if (m->control_pressed) { return filesToProcess; }
636 if (filePairsToProcess.size() != 0) {
637 for (int i = 0; i < filePairsToProcess.size(); i++) {
639 if (m->control_pressed) { for (int l = 0; l < filesToProcess.size(); l++) { for (int k = 0; k < filesToProcess[l].size(); k++) { for(int j = 0; j < filesToProcess[l][k].size(); j++) { m->mothurRemove(filesToProcess[l][k][j]); } filesToProcess[l][k].clear(); } return filesToProcess; } }
641 unsigned long int thisFilesReads;
642 vector< vector<string> > files = readFastqFiles(thisFilesReads, filePairsToProcess[i][0], filePairsToProcess[i][1], filePairsToProcess[i][2], filePairsToProcess[i][3]);
644 //adjust for really large processors or really small files
645 if (thisFilesReads < processors) {
646 m->mothurOut("[ERROR]: " + filePairsToProcess[i][0] + " has less than " + toString(processors) + " good reads, skipping\n");
647 for (int k = 0; k < files.size(); k++) { for(int j = 0; j < files[k].size(); j++) { m->mothurRemove(files[k][j]); } files[k].clear(); }
648 //remove from file2Group if necassary
649 map<int, string> cFile2Group;
650 for (map<int, string>::iterator it = file2Group.begin(); it != file2Group.end(); it++) {
651 if ((it->first) < i) { cFile2Group[it->first] = it->second; }
652 else if ((it->first) == i) { } //do nothing, we removed files for i
653 else { cFile2Group[(it->first-1)] = it->second; } //adjust files because i was removed
655 file2Group = cFile2Group;
657 filesToProcess.push_back(files);
658 numReads += thisFilesReads;
662 if (numReads == 0) { m->control_pressed = true; }
664 }else if (ffastafile != "") {
665 vector< vector<string> > files = readFastaFiles(numReads, ffastafile, rfastafile);
666 //adjust for really large processors or really small files
667 if (numReads == 0) { m->mothurOut("[ERROR]: no good reads.\n"); m->control_pressed = true; }
668 if (numReads < processors) {
669 for (int i = numReads; i < processors; i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } files[i].clear(); }
670 files.resize(numReads);
671 processors = numReads;
673 filesToProcess.push_back(files);
674 }else { m->control_pressed = true; } //should not get here
676 return filesToProcess;
678 catch(exception& e) {
679 m->errorOut(e, "MakeContigsCommand", "preProcessData");
683 //**********************************************************************************************************************
684 int MakeContigsCommand::createProcesses(vector< vector<string> > files, string outputFasta, string outputScrapFasta, string outputMisMatches, vector<vector<string> > fastaFileNames, int index) {
687 vector<int> processIDS;
689 map<int, string>::iterator it = file2Group.find(index);
690 if (it != file2Group.end()) { group = it->second; }
692 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
695 //loop through and create all the processes you want
696 while (process != processors-1) {
700 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
703 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
708 for(int i=0;i<tempFASTAFileNames.size();i++){
709 for(int j=0;j<tempFASTAFileNames[i].size();j++){
710 if (tempFASTAFileNames[i][j] != "") {
711 tempFASTAFileNames[i][j] += m->mothurGetpid(process) + ".temp";
712 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
718 num = driver(files[process],
719 outputFasta + m->mothurGetpid(process) + ".temp",
720 outputScrapFasta + m->mothurGetpid(process) + ".temp",
721 outputMisMatches + m->mothurGetpid(process) + ".temp",
722 tempFASTAFileNames, process, group);
724 //pass groupCounts to parent
726 string tempFile = m->mothurGetpid(process) + ".num.temp";
727 m->openOutputFile(tempFile, out);
729 if (createFileGroup || createOligosGroup) {
730 out << groupCounts.size() << endl;
732 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
733 out << it->first << '\t' << it->second << endl;
736 out << groupMap.size() << endl;
737 for (map<string, string>::iterator it = groupMap.begin(); it != groupMap.end(); it++) {
738 out << it->first << '\t' << it->second << endl;
745 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
746 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
752 m->openOutputFile(outputFasta, temp); temp.close();
753 m->openOutputFile(outputScrapFasta, temp); temp.close();
756 num = driver(files[processors-1], outputFasta, outputScrapFasta, outputMisMatches, fastaFileNames, processors-1, group);
758 //force parent to wait until all the processes are done
759 for (int i=0;i<processIDS.size();i++) {
760 int temp = processIDS[i];
764 for (int i = 0; i < processIDS.size(); i++) {
766 string tempFile = toString(processIDS[i]) + ".num.temp";
767 m->openInputFile(tempFile, in);
769 in >> tempNum; num += tempNum; m->gobble(in);
771 if (createFileGroup || createOligosGroup) {
773 in >> tempNum; m->gobble(in);
776 for (int j = 0; j < tempNum; j++) {
778 in >> group >> groupNum; m->gobble(in);
780 map<string, int>::iterator it = groupCounts.find(group);
781 if (it == groupCounts.end()) { groupCounts[group] = groupNum; }
782 else { groupCounts[it->first] += groupNum; }
785 in >> tempNum; m->gobble(in);
787 for (int j = 0; j < tempNum; j++) {
788 string group, seqName;
789 in >> seqName >> group; m->gobble(in);
791 map<string, string>::iterator it = groupMap.find(seqName);
792 if (it == groupMap.end()) { groupMap[seqName] = group; }
793 else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
797 in.close(); m->mothurRemove(tempFile);
801 //////////////////////////////////////////////////////////////////////////////////////////////////////
802 //Windows version shared memory, so be careful when passing variables through the contigsData struct.
803 //Above fork() will clone, so memory is separate, but that's not the case with windows,
804 //////////////////////////////////////////////////////////////////////////////////////////////////////
806 vector<contigsData*> pDataArray;
807 DWORD dwThreadIdArray[processors-1];
808 HANDLE hThreadArray[processors-1];
810 //Create processor worker threads.
811 for( int h=0; h<processors-1; h++ ){
812 string extension = "";
813 if (h != 0) { extension = toString(h) + ".temp"; processIDS.push_back(h); }
814 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
819 for(int i=0;i<tempFASTAFileNames.size();i++){
820 for(int j=0;j<tempFASTAFileNames[i].size();j++){
821 if (tempFASTAFileNames[i][j] != "") {
822 tempFASTAFileNames[i][j] += extension;
823 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
829 contigsData* tempcontig = new contigsData(group, files[h], (outputFasta + extension), (outputScrapFasta + extension), (outputMisMatches + extension), align, m, match, misMatch, gapOpen, gapExtend, insert, deltaq, tempFASTAFileNames, oligosfile, reorient, pdiffs, bdiffs, tdiffs, createOligosGroup, createFileGroup, allFiles, trimOverlap, h);
830 pDataArray.push_back(tempcontig);
832 hThreadArray[h] = CreateThread(NULL, 0, MyContigsThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]);
835 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
839 string extension = toString(processors-1) + ".temp";
841 for(int i=0;i<tempFASTAFileNames.size();i++){
842 for(int j=0;j<tempFASTAFileNames[i].size();j++){
843 if (tempFASTAFileNames[i][j] != "") {
844 tempFASTAFileNames[i][j] += extension;
845 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
853 m->openOutputFile(outputFasta, temp); temp.close();
854 m->openOutputFile(outputScrapFasta, temp); temp.close();
857 processIDS.push_back(processors-1);
858 num = driver(files[processors-1], (outputFasta+ toString(processors-1) + ".temp"), (outputScrapFasta+ toString(processors-1) + ".temp"), (outputMisMatches+ toString(processors-1) + ".temp"), tempFASTAFileNames, processors-1, group);
860 //Wait until all threads have terminated.
861 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
863 //Close all thread handles and free memory allocations.
864 for(int i=0; i < pDataArray.size(); i++){
865 num += pDataArray[i]->count;
866 if (!pDataArray[i]->done) {
867 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of sequences assigned to it, quitting. \n"); m->control_pressed = true;
869 for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
870 map<string, int>::iterator it2 = groupCounts.find(it->first);
871 if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
872 else { groupCounts[it->first] += it->second; }
874 for (map<string, string>::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) {
875 map<string, string>::iterator it2 = groupMap.find(it->first);
876 if (it2 == groupMap.end()) { groupMap[it->first] = it->second; }
877 else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
879 CloseHandle(hThreadArray[i]);
880 delete pDataArray[i];
885 for (int i = 0; i < processIDS.size(); i++) {
886 m->appendFiles((outputFasta + toString(processIDS[i]) + ".temp"), outputFasta);
887 m->mothurRemove((outputFasta + toString(processIDS[i]) + ".temp"));
889 m->appendFiles((outputScrapFasta + toString(processIDS[i]) + ".temp"), outputScrapFasta);
890 m->mothurRemove((outputScrapFasta + toString(processIDS[i]) + ".temp"));
892 m->appendFilesWithoutHeaders((outputMisMatches + toString(processIDS[i]) + ".temp"), outputMisMatches);
893 m->mothurRemove((outputMisMatches + toString(processIDS[i]) + ".temp"));
896 for(int j=0;j<fastaFileNames.size();j++){
897 for(int k=0;k<fastaFileNames[j].size();k++){
898 if (fastaFileNames[j][k] != "") {
899 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
900 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
909 catch(exception& e) {
910 m->errorOut(e, "MakeContigsCommand", "createProcesses");
914 //**********************************************************************************************************************
915 int MakeContigsCommand::driver(vector<string> files, string outputFasta, string outputScrapFasta, string outputMisMatches, vector<vector<string> > fastaFileNames, int process, string group){
918 Alignment* alignment;
919 if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); }
920 else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); }
923 string thisffastafile = files[0];
924 string thisfqualfile = files[1];
925 string thisrfastafile = files[2];
926 string thisrqualfile = files[3];
927 string thisfindexfile = files[4];
928 string thisrindexfile = files[5];
930 if (m->debug) { m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n[DEBUG]: findex = " + thisfindexfile + ".\n[DEBUG]: rindex = " + thisrindexfile + ".\n"); }
932 ifstream inFFasta, inRFasta, inFQual, inRQual, inFIndex, inRIndex;
933 ofstream outFasta, outMisMatch, outScrapFasta;
934 m->openInputFile(thisffastafile, inFFasta);
935 m->openInputFile(thisrfastafile, inRFasta);
936 if (thisfqualfile != "") {
937 m->openInputFile(thisfqualfile, inFQual);
938 m->openInputFile(thisrqualfile, inRQual);
941 if (thisfindexfile != "") { m->openInputFile(thisfindexfile, inFIndex); }
942 if (thisrindexfile != "") { m->openInputFile(thisrindexfile, inRIndex); }
944 m->openOutputFile(outputFasta, outFasta);
945 m->openOutputFile(outputScrapFasta, outScrapFasta);
946 m->openOutputFile(outputMisMatches, outMisMatch);
947 outMisMatch << "Name\tLength\tOverlap_Length\tOverlap_Start\tOverlap_End\tMisMatches\tNum_Ns\n";
949 TrimOligos trimOligos(pdiffs, bdiffs, 0, 0, oligos->getPairedPrimers(), oligos->getPairedBarcodes());
951 TrimOligos* rtrimOligos = NULL;
953 rtrimOligos = new TrimOligos(pdiffs, bdiffs, 0, 0, oligos->getReorientedPairedPrimers(), oligos->getReorientedPairedBarcodes()); numBarcodes = oligos->getReorientedPairedBarcodes().size();
956 while ((!inFFasta.eof()) && (!inRFasta.eof())) {
958 if (m->control_pressed) { break; }
961 string trashCode = "";
962 int currentSeqsDiffs = 0;
964 //read seqs and quality info
965 Sequence fSeq(inFFasta); m->gobble(inFFasta);
966 Sequence rSeq(inRFasta); m->gobble(inRFasta);
967 QualityScores* fQual = NULL; QualityScores* rQual = NULL;
968 if (thisfqualfile != "") {
969 fQual = new QualityScores(inFQual); m->gobble(inFQual);
970 rQual = new QualityScores(inRQual); m->gobble(inRQual);
972 Sequence findexBarcode("findex", "NONE"); Sequence rindexBarcode("rindex", "NONE");
973 if (thisfindexfile != "") {
974 Sequence temp(inFIndex); m->gobble(inFIndex);
975 findexBarcode.setAligned(temp.getAligned());
978 if (thisrindexfile != "") {
979 Sequence temp(inRIndex); m->gobble(inRIndex);
980 rindexBarcode.setAligned(temp.getAligned());
983 int barcodeIndex = 0;
985 Sequence savedFSeq(fSeq.getName(), fSeq.getAligned()); Sequence savedRSeq(rSeq.getName(), rSeq.getAligned());
986 Sequence savedFindex(findexBarcode.getName(), findexBarcode.getAligned()); Sequence savedRIndex(rindexBarcode.getName(), rindexBarcode.getAligned());
987 QualityScores* savedFQual = NULL; QualityScores* savedRQual = NULL;
988 if (thisfqualfile != "") {
989 savedFQual = new QualityScores(fQual->getName(), fQual->getQualityScores());
990 savedRQual = new QualityScores(rQual->getName(), rQual->getQualityScores());
993 if(numBarcodes != 0){
994 if (thisfqualfile != "") {
995 if ((thisfindexfile != "") || (thisrindexfile != "")) {
996 success = trimOligos.stripBarcode(findexBarcode, rindexBarcode, *fQual, *rQual, barcodeIndex);
998 success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
1001 success = trimOligos.stripBarcode(fSeq, rSeq, barcodeIndex);
1003 if(success > bdiffs) { trashCode += 'b'; }
1004 else{ currentSeqsDiffs += success; }
1007 if(numFPrimers != 0){
1008 if (thisfqualfile != "") {
1009 success = trimOligos.stripForward(fSeq, rSeq, *fQual, *rQual, primerIndex);
1011 success = trimOligos.stripForward(fSeq, rSeq, primerIndex);
1013 if(success > pdiffs) { trashCode += 'f'; }
1014 else{ currentSeqsDiffs += success; }
1017 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
1019 if (reorient && (trashCode != "")) { //if you failed and want to check the reverse
1020 int thisSuccess = 0;
1021 string thisTrashCode = "";
1022 int thisCurrentSeqsDiffs = 0;
1024 int thisBarcodeIndex = 0;
1025 int thisPrimerIndex = 0;
1027 if(numBarcodes != 0){
1028 if (thisfqualfile != "") {
1029 if ((thisfindexfile != "") || (thisrindexfile != "")) {
1030 thisSuccess = rtrimOligos->stripBarcode(savedFindex, savedRIndex, *savedFQual, *savedRQual, thisBarcodeIndex);
1032 thisSuccess = rtrimOligos->stripBarcode(savedFSeq, savedRSeq, *savedFQual, *savedRQual, thisBarcodeIndex);
1035 thisSuccess = rtrimOligos->stripBarcode(savedFSeq, savedRSeq, thisBarcodeIndex);
1037 if(thisSuccess > bdiffs) { thisTrashCode += 'b'; }
1038 else{ thisCurrentSeqsDiffs += thisSuccess; }
1041 if(numFPrimers != 0){
1042 if (thisfqualfile != "") {
1043 thisSuccess = rtrimOligos->stripForward(savedFSeq, savedRSeq, *savedFQual, *savedRQual, thisPrimerIndex);
1045 thisSuccess = rtrimOligos->stripForward(savedFSeq, savedRSeq, thisPrimerIndex);
1047 if(thisSuccess > pdiffs) { thisTrashCode += 'f'; }
1048 else{ thisCurrentSeqsDiffs += thisSuccess; }
1051 if (thisCurrentSeqsDiffs > tdiffs) { thisTrashCode += 't'; }
1053 if (thisTrashCode == "") {
1054 trashCode = thisTrashCode;
1055 success = thisSuccess;
1056 currentSeqsDiffs = thisCurrentSeqsDiffs;
1057 barcodeIndex = thisBarcodeIndex;
1058 primerIndex = thisPrimerIndex;
1059 savedFSeq.reverseComplement();
1060 savedRSeq.reverseComplement();
1061 fSeq.setAligned(savedFSeq.getAligned());
1062 rSeq.setAligned(savedRSeq.getAligned());
1063 if(thisfqualfile != ""){
1064 savedFQual->flipQScores(); savedRQual->flipQScores();
1065 fQual->setScores(savedFQual->getScores()); rQual->setScores(savedRQual->getScores());
1067 }else { trashCode += "(" + thisTrashCode + ")"; }
1071 //flip the reverse reads
1072 rSeq.reverseComplement();
1073 if (thisfqualfile != "") { rQual->flipQScores(); }
1076 alignment->align(fSeq.getUnaligned(), rSeq.getUnaligned());
1077 map<int, int> ABaseMap = alignment->getSeqAAlnBaseMap();
1078 map<int, int> BBaseMap = alignment->getSeqBAlnBaseMap();
1079 fSeq.setAligned(alignment->getSeqAAln());
1080 rSeq.setAligned(alignment->getSeqBAln());
1081 int length = fSeq.getAligned().length();
1083 //traverse alignments merging into one contiguous seq
1085 int numMismatches = 0;
1086 string seq1 = fSeq.getAligned();
1087 string seq2 = rSeq.getAligned();
1088 vector<int> scores1, scores2;
1089 if (thisfqualfile != "") {
1090 scores1 = fQual->getQualityScores();
1091 scores2 = rQual->getQualityScores();
1092 delete fQual; delete rQual; delete savedFQual; delete savedRQual;
1095 // if (num < 5) { cout << fSeq.getStartPos() << '\t' << fSeq.getEndPos() << '\t' << rSeq.getStartPos() << '\t' << rSeq.getEndPos() << endl; }
1096 int overlapStart = fSeq.getStartPos();
1097 int seq2Start = rSeq.getStartPos();
1099 //bigger of the 2 starting positions is the location of the overlapping start
1100 if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
1101 overlapStart = seq2Start;
1102 for (int i = 0; i < overlapStart; i++) { contig += seq1[i]; }
1103 }else { //seq1 starts later so take from 0 to overlapStart from seq2
1104 for (int i = 0; i < overlapStart; i++) { contig += seq2[i]; }
1107 int seq1End = fSeq.getEndPos();
1108 int seq2End = rSeq.getEndPos();
1109 int overlapEnd = seq1End;
1110 if (seq2End < overlapEnd) { overlapEnd = seq2End; } //smallest end position is where overlapping ends
1112 int oStart = contig.length();
1113 //cout << fSeq.getAligned() << endl; cout << rSeq.getAligned() << endl;
1114 for (int i = overlapStart; i < overlapEnd; i++) {
1115 //cout << seq1[i] << ' ' << seq2[i] << ' ' << scores1[ABaseMap[i]] << ' ' << scores2[BBaseMap[i]] << endl;
1116 if (seq1[i] == seq2[i]) { //match, add base and choose highest score
1118 }else if (((seq1[i] == '.') || (seq1[i] == '-')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //seq1 is a gap and seq2 is a base, choose seq2, unless quality score for base is below insert. In that case eliminate base
1119 if (thisfqualfile != "") {
1120 if (scores2[BBaseMap[i]] <= insert) { } //
1121 else { contig += seq2[i]; }
1122 }else { contig += seq2[i]; } //with no quality info, then we keep it?
1123 }else if (((seq2[i] == '.') || (seq2[i] == '-')) && ((seq1[i] != '-') && (seq1[i] != '.'))) { //seq2 is a gap and seq1 is a base, choose seq1, unless quality score for base is below insert. In that case eliminate base
1124 if (thisfqualfile != "") {
1125 if (scores1[ABaseMap[i]] <= insert) { } //
1126 else { contig += seq1[i]; }
1127 }else { contig += seq1[i]; } //with no quality info, then we keep it?
1128 }else if (((seq1[i] != '-') && (seq1[i] != '.')) && ((seq2[i] != '-') && (seq2[i] != '.'))) { //both bases choose one with better quality
1129 if (thisfqualfile != "") {
1130 if (abs(scores1[ABaseMap[i]] - scores2[BBaseMap[i]]) >= deltaq) { //is the difference in qual scores >= deltaq, if yes choose base with higher score
1132 if (scores1[ABaseMap[i]] < scores2[BBaseMap[i]]) { c = seq2[i]; }
1134 }else { //if no, base becomes n
1138 }else { numMismatches++; } //cant decide, so eliminate and mark as mismatch
1139 }else { //should never get here
1140 m->mothurOut("[ERROR]: case I didn't think of seq1 = " + toString(seq1[i]) + " and seq2 = " + toString(seq2[i]) + "\n");
1143 int oend = contig.length();
1144 if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
1145 for (int i = overlapEnd; i < length; i++) { contig += seq2[i]; }
1146 }else { //seq2 ends before seq1 so take from overlap to length from seq1
1147 for (int i = overlapEnd; i < length; i++) { contig += seq1[i]; }
1149 //cout << contig << endl;
1151 if (trimOverlap) { contig = contig.substr(overlapStart-1, oend-oStart); if (contig.length() == 0) { trashCode += "l"; } }
1153 if(trashCode.length() == 0){
1154 bool ignore = false;
1156 if (m->debug) { m->mothurOut(fSeq.getName()); }
1158 if (createOligosGroup) {
1159 string thisGroup = oligos->getGroupName(barcodeIndex, primerIndex);
1160 if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); }
1162 int pos = thisGroup.find("ignore");
1163 if (pos == string::npos) {
1164 groupMap[fSeq.getName()] = thisGroup;
1166 map<string, int>::iterator it = groupCounts.find(thisGroup);
1167 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
1168 else { groupCounts[it->first] ++; }
1169 }else { ignore = true; }
1170 }else if (createFileGroup) {
1171 int pos = group.find("ignore");
1172 if (pos == string::npos) {
1173 groupMap[fSeq.getName()] = group;
1175 map<string, int>::iterator it = groupCounts.find(group);
1176 if (it == groupCounts.end()) { groupCounts[group] = 1; }
1177 else { groupCounts[it->first] ++; }
1178 }else { ignore = true; }
1180 if (m->debug) { m->mothurOut("\n"); }
1182 if(allFiles && !ignore){
1184 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
1185 output << ">" << fSeq.getName() << endl << contig << endl;
1190 outFasta << ">" << fSeq.getName() << endl << contig << endl;
1192 for (int i = 0; i < contig.length(); i++) { if (contig[i] == 'N') { numNs++; } }
1193 outMisMatch << fSeq.getName() << '\t' << contig.length() << '\t' << (oend-oStart) << '\t' << oStart << '\t' << oend << '\t' << numMismatches << '\t' << numNs << endl;
1196 outScrapFasta << ">" << fSeq.getName() << " | " << trashCode << endl << contig << endl;
1201 if((num) % 1000 == 0){ m->mothurOutJustToScreen(toString(num)); m->mothurOutEndLine(); }
1205 if((num) % 1000 != 0){ m->mothurOut(toString(num)); m->mothurOutEndLine(); }
1210 outScrapFasta.close();
1211 outMisMatch.close();
1212 if (thisfqualfile != "") {
1217 if (reorient) { delete rtrimOligos; }
1219 if (m->control_pressed) { m->mothurRemove(outputFasta); m->mothurRemove(outputScrapFasta);m->mothurRemove(outputMisMatches); }
1223 catch(exception& e) {
1224 m->errorOut(e, "MakeContigsCommand", "driver");
1228 //**********************************************************************************************************************
1229 vector< vector<string> > MakeContigsCommand::readFastqFiles(unsigned long int& count, string ffastq, string rfastq, string findex, string rindex){
1231 vector< vector<string> > files;
1232 //maps processors number to file pointer
1233 map<int, vector<ofstream*> > tempfiles; //tempfiles[0] = forwardFasta, [1] = forwardQual, [2] = reverseFasta, [3] = reverseQual, tempfiles[4] = forwardIndex, [4] = forwardReverse
1234 map<int, vector<ofstream*> >::iterator it;
1236 //create files to write to
1237 for (int i = 0; i < processors; i++) {
1238 vector<ofstream*> temp;
1239 ofstream* outFF = new ofstream; temp.push_back(outFF);
1240 ofstream* outFQ = new ofstream; temp.push_back(outFQ);
1241 ofstream* outRF = new ofstream; temp.push_back(outRF);
1242 ofstream* outRQ = new ofstream; temp.push_back(outRQ);
1243 ofstream* outFI = new ofstream; temp.push_back(outFI);
1244 ofstream* outRI = new ofstream; temp.push_back(outRI);
1245 tempfiles[i] = temp;
1247 vector<string> names;
1248 string thisOutputDir = outputDir;
1249 if (outputDir == "") { thisOutputDir = m->hasPath(ffastq); }
1250 string ffastafilename = thisOutputDir + m->getRootName(m->getSimpleName(ffastq)) + toString(i) + "ffastatemp";
1251 string rfastafilename = thisOutputDir + m->getRootName(m->getSimpleName(rfastq)) + toString(i) + "rfastatemp";
1252 string fqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(ffastq)) + toString(i) + "fqualtemp";
1253 string rqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(rfastq)) + toString(i) + "rqualtemp";
1254 string findexfilename = ""; string rindexfilename = "";
1255 noneOk = false; //flag to oligos file read that its okay to allow for non paired barcodes
1256 if (findex != "") { findexfilename = thisOutputDir + m->getRootName(m->getSimpleName(findex)) + toString(i) + "findextemp"; m->openOutputFile(findexfilename, *outFI); noneOk = true; }
1257 if (rindex != "") { rindexfilename = thisOutputDir + m->getRootName(m->getSimpleName(rindex)) + toString(i) + "rindextemp"; m->openOutputFile(rindexfilename, *outRI); noneOk = true; }
1258 names.push_back(ffastafilename); names.push_back(fqualfilename);
1259 names.push_back(rfastafilename); names.push_back(rqualfilename);
1260 names.push_back(findexfilename); names.push_back(rindexfilename);
1261 files.push_back(names);
1263 m->openOutputFile(ffastafilename, *outFF);
1264 m->openOutputFile(rfastafilename, *outRF);
1265 m->openOutputFile(fqualfilename, *outFQ);
1266 m->openOutputFile(rqualfilename, *outRQ);
1269 if (m->control_pressed) {
1270 //close files, delete ofstreams
1271 for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } }
1273 for (int i = 0; i < files.size(); i++) {
1274 for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } }
1279 m->openInputFile(ffastq, inForward);
1282 m->openInputFile(rfastq, inReverse);
1284 ifstream infIndex, inrIndex;
1285 bool findexIsGood = false;
1286 bool rindexIsGood = false;
1287 if (findex != "") { m->openInputFile(findex, infIndex); findexIsGood = true; }
1288 if (rindex != "") { m->openInputFile(rindex, inrIndex); rindexIsGood = true; }
1291 map<string, fastqRead> uniques;
1292 map<string, fastqRead> iUniques;
1293 map<string, pairFastqRead> pairUniques;
1294 map<string, fastqRead>::iterator itUniques;
1295 while ((!inForward.eof()) || (!inReverse.eof()) || (findexIsGood) || (rindexIsGood)) {
1297 if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } } } inForward.close(); inReverse.close(); if (findex != "") { infIndex.close(); } if (findex != "") { inrIndex.close(); } return files; }
1299 //get a read from forward and reverse fastq files
1300 bool ignoref, ignorer, ignorefi, ignoreri;
1301 fastqRead thisFread, thisRread, thisFIread, thisRIread;
1302 if (!inForward.eof()) { thisFread = readFastq(inForward, ignoref); }
1303 else { ignoref = true; }
1304 if (!inReverse.eof()) { thisRread = readFastq(inReverse, ignorer); }
1305 else { ignorer = true; }
1306 if (findexIsGood) { thisFIread = readFastq(infIndex, ignorefi); if (infIndex.eof()) { findexIsGood = false; } }
1307 else { ignorefi = true; }
1308 if (rindexIsGood) { thisRIread = readFastq(inrIndex, ignoreri); if (inrIndex.eof()) { rindexIsGood = false; } }
1309 else { ignoreri = true; }
1311 bool allowOne = false;
1312 if ((findex == "") || (rindex == "")) { allowOne = true; }
1313 vector<pairFastqRead> frReads = getReads(ignoref, ignorer, thisFread, thisRread, uniques, false);
1314 vector<pairFastqRead> friReads = getReads(ignorefi, ignoreri, thisFIread, thisRIread, iUniques, allowOne);
1316 //add in index info if provided
1317 vector<pairFastqRead> reads = frReads;
1318 if ((findex != "") || (rindex != "")) { reads = mergeReads(frReads, friReads, pairUniques); }
1320 for (int i = 0; i < reads.size(); i++) {
1321 fastqRead fread = reads[i].forward;
1322 fastqRead rread = reads[i].reverse;
1323 fastqRead firead = reads[i].findex;
1324 fastqRead riread = reads[i].rindex;
1326 if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); if (findex != "") { m->mothurOut(toString(count) + '\t' + firead.name + '\n'); } if (rindex != "") { m->mothurOut(toString(count) + '\t' + riread.name + '\n'); } }
1328 //if (checkReads(fread, rread, ffastq, rfastq)) {
1329 if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } } } inForward.close(); inReverse.close(); if (findex != "") { infIndex.close(); } if (findex != "") { inrIndex.close(); } return files; }
1331 //if the reads are okay write to output files
1332 int process = count % processors;
1334 *(tempfiles[process][0]) << ">" << fread.name << endl << fread.sequence << endl;
1335 *(tempfiles[process][1]) << ">" << fread.name << endl;
1336 for (int i = 0; i < fread.scores.size(); i++) { *(tempfiles[process][1]) << fread.scores[i] << " "; }
1337 *(tempfiles[process][1]) << endl;
1338 *(tempfiles[process][2]) << ">" << rread.name << endl << rread.sequence << endl;
1339 *(tempfiles[process][3]) << ">" << rread.name << endl;
1340 for (int i = 0; i < rread.scores.size(); i++) { *(tempfiles[process][3]) << rread.scores[i] << " "; }
1341 *(tempfiles[process][3]) << endl;
1342 if (findex != "") { *(tempfiles[process][4]) << ">" << firead.name << endl << firead.sequence << endl; }
1343 if (rindex != "") { *(tempfiles[process][5]) << ">" << riread.name << endl << riread.sequence << endl; }
1348 if((count) % 10000 == 0){ m->mothurOutJustToScreen(toString(count)); m->mothurOutEndLine(); }
1353 if((count) % 10000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
1355 if (uniques.size() != 0) {
1356 for (itUniques = uniques.begin(); itUniques != uniques.end(); itUniques++) {
1357 if (m->control_pressed) { break; }
1358 m->mothurOut("[WARNING]: did not find paired read for " + itUniques->first + ", ignoring.\n");
1360 for (map<string, pairFastqRead>:: iterator it = pairUniques.begin(); it != pairUniques.end(); it++) {
1361 if (m->control_pressed) { break; }
1362 m->mothurOut("[WARNING]: did not find paired read for " + (it->first).substr(1) + ", ignoring.\n");
1364 m->mothurOutEndLine();
1367 //close files, delete ofstreams
1368 for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } }
1371 if (findex != "") { infIndex.close(); }
1372 if (rindex != "") { inrIndex.close(); }
1376 catch(exception& e) {
1377 m->errorOut(e, "MakeContigsCommand", "readFastqFiles");
1381 //**********************************************************************************************************************
1382 vector< vector<string> > MakeContigsCommand::readFastaFiles(unsigned long int& count, string ffasta, string rfasta){
1384 vector< vector<string> > files;
1385 //maps processors number to file pointer
1386 map<int, vector<ofstream*> > tempfiles; //tempfiles[0] = forwardFasta, [1] = forwardQual, [2] = reverseFasta, [3] = reverseQual
1387 map<int, vector<ofstream*> >::iterator it;
1389 //create files to write to
1390 for (int i = 0; i < processors; i++) {
1391 vector<ofstream*> temp;
1392 ofstream* outFF = new ofstream; temp.push_back(outFF);
1393 ofstream* outFQ = new ofstream; temp.push_back(outFQ);
1394 ofstream* outRF = new ofstream; temp.push_back(outRF);
1395 ofstream* outRQ = new ofstream; temp.push_back(outRQ);
1396 tempfiles[i] = temp;
1398 vector<string> names;
1399 string thisOutputDir = outputDir;
1400 if (outputDir == "") { thisOutputDir = m->hasPath(ffasta); }
1401 string ffastafilename = thisOutputDir + m->getRootName(m->getSimpleName(ffasta)) + toString(i) + "ffastatemp";
1402 string rfastafilename = thisOutputDir + m->getRootName(m->getSimpleName(rfasta)) + toString(i) + "rfastatemp";
1403 string fqualfilename = "";
1404 if (fqualfile != "") { fqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(fqualfile)) + toString(i) + "fqual.temp"; m->openOutputFile(fqualfilename, *outFQ); }
1405 string rqualfilename = "";
1406 if (rqualfile != "") { rqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(rqualfile)) + toString(i) + "rqual.temp"; m->openOutputFile(rqualfilename, *outRQ); }
1407 string findexfilename = ""; string rindexfilename = "";
1408 names.push_back(ffastafilename); names.push_back(fqualfilename);
1409 names.push_back(rfastafilename); names.push_back(rqualfilename);
1410 names.push_back(findexfilename); names.push_back(rindexfilename);
1411 files.push_back(names);
1413 m->openOutputFile(ffastafilename, *outFF);
1414 m->openOutputFile(rfastafilename, *outRF);
1417 if (m->control_pressed) {
1418 //close files, delete ofstreams
1419 for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } }
1421 for (int i = 0; i < files.size(); i++) {
1422 for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); }
1426 ifstream inForwardFasta;
1427 m->openInputFile(ffasta, inForwardFasta);
1429 ifstream inReverseFasta;
1430 m->openInputFile(rfasta, inReverseFasta);
1432 ifstream inForwardQual, inReverseQual;
1433 if (fqualfile != "") { m->openInputFile(fqualfile, inForwardQual); m->openInputFile(rqualfile, inReverseQual); }
1436 map<string, fastqRead> uniques;
1437 map<string, fastqRead>::iterator itUniques;
1438 while ((!inForwardFasta.eof()) || (!inReverseFasta.eof())) {
1440 if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inReverseFasta.close(); inForwardFasta.close(); if (fqualfile != "") { inReverseQual.close(); inReverseQual.close(); } return files; }
1442 //get a reads from forward and reverse fasta files
1443 bool ignoref, ignorer;
1444 fastqRead thisFread, thisRread;
1445 if (!inForwardFasta.eof()) {
1447 Sequence temp(inForwardFasta); m->gobble(inForwardFasta);
1448 thisFread.name = temp.getName();
1449 thisFread.sequence = temp.getUnaligned();
1450 }else { ignoref = true; }
1451 if (!inReverseFasta.eof()) {
1453 Sequence temp(inReverseFasta); m->gobble(inReverseFasta);
1454 thisRread.name = temp.getName();
1455 thisRread.sequence = temp.getUnaligned();
1456 }else { ignorer = true; }
1458 //get qual reads if given
1459 if (fqualfile != "") {
1460 if (!inForwardQual.eof() && !ignoref) {
1461 QualityScores temp(inForwardQual); m->gobble(inForwardQual);
1462 //if forward files dont match ignore read
1463 if (thisFread.name != temp.getName()) { ignoref = true; }
1464 else { thisFread.scores = temp.getQualityScores(); }
1465 }else { ignoref = true; }
1466 if (!inReverseQual.eof() && !ignorer) {
1467 QualityScores temp(inReverseQual); m->gobble(inReverseQual);
1468 //if reverse files dont match ignore read
1469 if (thisRread.name != temp.getName()) { ignorer = true; }
1470 else { thisRread.scores = temp.getQualityScores(); }
1471 }else { ignorer = true; }
1474 vector<pairFastqRead> reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques, false);
1476 for (int i = 0; i < reads.size(); i++) {
1477 fastqRead fread = reads[i].forward;
1478 fastqRead rread = reads[i].reverse;
1480 if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); }
1482 // if (checkReads(fread, rread, ffasta, rfasta)) {
1483 if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inReverseFasta.close(); inForwardFasta.close(); if (fqualfile != "") { inReverseQual.close(); inReverseQual.close(); } return files; }
1485 //if the reads are okay write to output files
1486 int process = count % processors;
1488 *(tempfiles[process][0]) << ">" << fread.name << endl << fread.sequence << endl;
1489 *(tempfiles[process][2]) << ">" << rread.name << endl << rread.sequence << endl;
1490 if (fqualfile != "") { //if you have quality info, print it
1491 *(tempfiles[process][1]) << ">" << fread.name << endl;
1492 for (int i = 0; i < fread.scores.size(); i++) { *(tempfiles[process][1]) << fread.scores[i] << " "; }
1493 *(tempfiles[process][1]) << endl;
1494 *(tempfiles[process][3]) << ">" << rread.name << endl;
1495 for (int i = 0; i < rread.scores.size(); i++) { *(tempfiles[process][3]) << rread.scores[i] << " "; }
1496 *(tempfiles[process][3]) << endl;
1501 if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
1506 if((count) % 10000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
1508 if (uniques.size() != 0) {
1509 for (itUniques = uniques.begin(); itUniques != uniques.end(); itUniques++) {
1510 m->mothurOut("[WARNING]: did not find paired read for " + itUniques->first + ", ignoring.\n");
1512 m->mothurOutEndLine();
1515 //close files, delete ofstreams
1516 for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } }
1517 inReverseFasta.close();
1518 inForwardFasta.close();
1519 if (fqualfile != "") { inReverseQual.close(); inReverseQual.close(); }
1523 catch(exception& e) {
1524 m->errorOut(e, "MakeContigsCommand", "readFastaFiles");
1528 //**********************************************************************************************************************
1529 vector<pairFastqRead> MakeContigsCommand::getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques, bool allowOne){
1531 vector<pairFastqRead> reads;
1532 map<string, fastqRead>::iterator itUniques;
1534 if (!ignoref && !ignorer) {
1535 if (forward.name == reverse.name) {
1536 pairFastqRead temp(forward, reverse);
1537 reads.push_back(temp);
1540 //if no match are the names only different by 1 and 2?
1541 string tempFRead = forward.name.substr(0, forward.name.length()-1);
1542 string tempRRead = reverse.name.substr(0, reverse.name.length()-1);
1543 if (tempFRead == tempRRead) {
1544 if ((forward.name[forward.name.length()-1] == '1') && (reverse.name[reverse.name.length()-1] == '2')) {
1545 forward.name = tempFRead;
1546 reverse.name = tempRRead;
1547 pairFastqRead temp(forward, reverse);
1548 reads.push_back(temp);
1554 //look for forward pair
1555 itUniques = uniques.find(forward.name);
1556 if (itUniques != uniques.end()) { //we have the pair for this read
1557 pairFastqRead temp(forward, itUniques->second);
1558 reads.push_back(temp);
1559 uniques.erase(itUniques);
1560 }else { //save this read for later
1561 uniques[forward.name] = forward;
1564 //look for reverse pair
1565 itUniques = uniques.find(reverse.name);
1566 if (itUniques != uniques.end()) { //we have the pair for this read
1567 pairFastqRead temp(itUniques->second, reverse);
1568 reads.push_back(temp);
1569 uniques.erase(itUniques);
1570 }else { //save this read for later
1571 uniques[reverse.name] = reverse;
1576 }else if (!ignoref && ignorer) { //ignore reverse keep forward
1579 pairFastqRead temp(forward, dummy);
1580 reads.push_back(temp);
1582 //look for forward pair
1583 itUniques = uniques.find(forward.name);
1584 if (itUniques != uniques.end()) { //we have the pair for this read
1585 pairFastqRead temp(forward, itUniques->second);
1586 reads.push_back(temp);
1587 uniques.erase(itUniques);
1588 }else { //save this read for later
1589 uniques[forward.name] = forward;
1592 }else if (ignoref && !ignorer) { //ignore forward keep reverse
1595 pairFastqRead temp(dummy, reverse);
1596 reads.push_back(temp);
1598 //look for reverse pair
1599 itUniques = uniques.find(reverse.name);
1600 if (itUniques != uniques.end()) { //we have the pair for this read
1601 pairFastqRead temp(itUniques->second, reverse);
1602 reads.push_back(temp);
1603 uniques.erase(itUniques);
1604 }else { //save this read for later
1605 uniques[reverse.name] = reverse;
1608 }//else ignore both and do nothing
1612 catch(exception& e) {
1613 m->errorOut(e, "MakeContigsCommand", "readFastqFiles");
1617 //**********************************************************************************************************************
1618 //look through the reads from the forward and reverse files and try to find matching reads from index files.
1619 vector<pairFastqRead> MakeContigsCommand::mergeReads(vector<pairFastqRead> thisReads, vector<pairFastqRead> indexes, map<string, pairFastqRead>& uniques){
1621 vector<pairFastqRead> reads;
1622 map<string, pairFastqRead>::iterator itUniques;
1624 set<int> foundIndexes;
1625 for (int i = 0; i < thisReads.size(); i++) {
1627 for (int j = 0; j < indexes.size(); j++) {
1629 //incase only one index
1630 string indexName = indexes[j].forward.name;
1631 if (indexName == "") { indexName = indexes[j].reverse.name; }
1633 if (thisReads[i].forward.name == indexName){
1634 thisReads[i].findex = indexes[j].forward;
1635 thisReads[i].rindex = indexes[j].reverse;
1636 reads.push_back(thisReads[i]);
1638 foundIndexes.insert(j);
1643 //look for forward pair
1644 itUniques = uniques.find('i'+thisReads[i].forward.name);
1645 if (itUniques != uniques.end()) { //we have the pair for this read
1646 thisReads[i].findex = itUniques->second.forward;
1647 thisReads[i].rindex = itUniques->second.reverse;
1648 reads.push_back(thisReads[i]);
1649 uniques.erase(itUniques);
1650 }else { //save this read for later
1651 uniques['r'+thisReads[i].forward.name] = thisReads[i];
1656 if (foundIndexes.size() != indexes.size()) { //if we didnt match all the indexes look for them in uniques
1657 for (int j = 0; j < indexes.size(); j++) {
1658 if (foundIndexes.count(j) == 0) { //we didnt find this one
1659 //incase only one index
1660 string indexName = indexes[j].forward.name;
1661 if (indexName == "") { indexName = indexes[j].reverse.name; }
1663 //look for forward pair
1664 itUniques = uniques.find('r'+indexName);
1665 if (itUniques != uniques.end()) { //we have the pair for this read
1666 pairFastqRead temp(itUniques->second.forward, itUniques->second.reverse, indexes[j].forward, indexes[j].reverse);
1667 reads.push_back(temp);
1668 uniques.erase(itUniques);
1669 }else { //save this read for later
1670 uniques['i'+indexName] = indexes[j];
1679 catch(exception& e) {
1680 m->errorOut(e, "MakeContigsCommand", "mergeReads");
1684 //**********************************************************************************************************************
1685 fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){
1691 //read sequence name
1692 string line = m->getline(in); m->gobble(in);
1693 vector<string> pieces = m->splitWhiteSpace(line);
1694 string name = ""; if (pieces.size() != 0) { name = pieces[0]; }
1695 if (name == "") { m->mothurOut("[WARNING]: Blank fasta name, ignoring read."); m->mothurOutEndLine(); ignore=true; }
1696 else if (name[0] != '@') { m->mothurOut("[WARNING]: reading " + name + " expected a name with @ as a leading character, ignoring read."); m->mothurOutEndLine(); ignore=true; }
1697 else { name = name.substr(1); }
1700 string sequence = m->getline(in); m->gobble(in);
1701 if (sequence == "") { m->mothurOut("[WARNING]: missing sequence for " + name + ", ignoring."); ignore=true; }
1703 //read sequence name
1704 line = m->getline(in); m->gobble(in);
1705 pieces = m->splitWhiteSpace(line);
1706 string name2 = ""; if (pieces.size() != 0) { name2 = pieces[0]; }
1707 if (name2 == "") { m->mothurOut("[WARNING]: expected a name with + as a leading character, ignoring."); ignore=true; }
1708 else if (name2[0] != '+') { m->mothurOut("[WARNING]: reading " + name2 + " expected a name with + as a leading character, ignoring."); ignore=true; }
1709 else { name2 = name2.substr(1); if (name2 == "") { name2 = name; } }
1711 //read quality scores
1712 string quality = m->getline(in); m->gobble(in);
1713 if (quality == "") { m->mothurOut("[WARNING]: missing quality for " + name2 + ", ignoring."); ignore=true; }
1715 //sanity check sequence length and number of quality scores match
1716 if (name2 != "") { if (name != name2) { m->mothurOut("[WARNING]: names do not match. read " + name + " for fasta and " + name2 + " for quality, ignoring."); ignore=true; } }
1717 if (quality.length() != sequence.length()) { m->mothurOut("[WARNING]: Lengths do not match for sequence " + name + ". Read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores, ignoring read."); ignore=true; }
1719 vector<int> qualScores = convertQual(quality);
1723 read.sequence = sequence;
1724 read.scores = qualScores;
1726 if (m->debug) { m->mothurOut("[DEBUG]: " + read.name + " " + read.sequence + " " + quality + "\n"); }
1730 catch(exception& e) {
1731 m->errorOut(e, "MakeContigsCommand", "readFastq");
1735 /**********************************************************************************************************************
1736 bool MakeContigsCommand::checkReads(fastqRead& forward, fastqRead& reverse, string ffile, string rfile){
1740 //do sequence lengths match
1741 if (forward.sequence.length() != reverse.sequence.length()) {
1742 m->mothurOut("[WARNING]: For sequence " + forward.name + " I read a sequence of length " + toString(forward.sequence.length()) + " from " + ffile + ", but read a sequence of length " + toString(reverse.sequence.length()) + " from " + rfile + ", ignoring.\n");
1746 //do number of qual scores match
1747 if (forward.scores.size() != reverse.scores.size()) {
1748 m->mothurOut("[WARNING]: For sequence " + forward.name + " I read " + toString(forward.scores.size()) + " quality scores from " + ffile + ", but read " + toString(reverse.scores.size()) + " quality scores from " + rfile + ", ignoring.\n");
1754 catch(exception& e) {
1755 m->errorOut(e, "MakeContigsCommand", "checkReads");
1759 //***************************************************************************************************************
1760 //lines can be 2, 3, or 4 columns
1761 // forward.fastq reverse.fastq -> 2 column
1762 // groupName forward.fastq reverse.fastq -> 3 column
1763 // forward.fastq reverse.fastq forward.index.fastq reverse.index.fastq -> 4 column
1764 // forward.fastq reverse.fastq none reverse.index.fastq -> 4 column
1765 // forward.fastq reverse.fastq forward.index.fastq none -> 4 column
1766 vector< vector<string> > MakeContigsCommand::readFileNames(string filename){
1768 vector< vector<string> > files;
1769 string forward, reverse, findex, rindex;
1772 m->openInputFile(filename, in);
1776 if (m->control_pressed) { return files; }
1778 string line = m->getline(in); m->gobble(in);
1779 vector<string> pieces = m->splitWhiteSpace(line);
1782 if (pieces.size() == 2) {
1783 forward = pieces[0];
1784 reverse = pieces[1];
1788 }else if (pieces.size() == 3) {
1790 forward = pieces[1];
1791 reverse = pieces[2];
1794 createFileGroup = true;
1795 }else if (pieces.size() == 4) {
1796 forward = pieces[0];
1797 reverse = pieces[1];
1800 if ((findex == "none") || (findex == "NONE")){ findex = ""; }
1801 if ((rindex == "none") || (rindex == "NONE")){ rindex = ""; }
1803 m->mothurOut("[ERROR]: file lines can be 2, 3, or 4 columns. The forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file, or forward fastq file then reverse fastq then forward index and reverse index file. If you only have one index file add 'none' for the other one. \n"); m->control_pressed = true;
1806 if (m->debug) { m->mothurOut("[DEBUG]: group = " + group + ", forward = " + forward + ", reverse = " + reverse + ", forwardIndex = " + findex + ", reverseIndex = " + rindex + ".\n"); }
1808 if (inputDir != "") {
1809 string path = m->hasPath(forward);
1810 if (path == "") { forward = inputDir + forward; }
1812 path = m->hasPath(reverse);
1813 if (path == "") { reverse = inputDir + reverse; }
1816 path = m->hasPath(findex);
1817 if (path == "") { findex = inputDir + findex; }
1821 path = m->hasPath(rindex);
1822 if (path == "") { rindex = inputDir + rindex; }
1826 //check to make sure both are able to be opened
1828 int openForward = m->openInputFile(forward, in2, "noerror");
1830 //if you can't open it, try default location
1831 if (openForward == 1) {
1832 if (m->getDefaultPath() != "") { //default path is set
1833 string tryPath = m->getDefaultPath() + m->getSimpleName(forward);
1834 m->mothurOut("Unable to open " + forward + ". Trying default " + tryPath); m->mothurOutEndLine();
1836 openForward = m->openInputFile(tryPath, in3, "noerror");
1842 //if you can't open it, try output location
1843 if (openForward == 1) {
1844 if (m->getOutputDir() != "") { //default path is set
1845 string tryPath = m->getOutputDir() + m->getSimpleName(forward);
1846 m->mothurOut("Unable to open " + forward + ". Trying output directory " + tryPath); m->mothurOutEndLine();
1848 openForward = m->openInputFile(tryPath, in4, "noerror");
1854 if (openForward == 1) { //can't find it
1855 m->mothurOut("[WARNING]: can't find " + forward + ", ignoring pair.\n");
1856 }else{ in2.close(); }
1859 int openReverse = m->openInputFile(reverse, in3, "noerror");
1861 //if you can't open it, try default location
1862 if (openReverse == 1) {
1863 if (m->getDefaultPath() != "") { //default path is set
1864 string tryPath = m->getDefaultPath() + m->getSimpleName(reverse);
1865 m->mothurOut("Unable to open " + reverse + ". Trying default " + tryPath); m->mothurOutEndLine();
1867 openReverse = m->openInputFile(tryPath, in3, "noerror");
1873 //if you can't open it, try output location
1874 if (openReverse == 1) {
1875 if (m->getOutputDir() != "") { //default path is set
1876 string tryPath = m->getOutputDir() + m->getSimpleName(reverse);
1877 m->mothurOut("Unable to open " + reverse + ". Trying output directory " + tryPath); m->mothurOutEndLine();
1879 openReverse = m->openInputFile(tryPath, in4, "noerror");
1885 if (openReverse == 1) { //can't find it
1886 m->mothurOut("[WARNING]: can't find " + reverse + ", ignoring pair.\n");
1887 }else{ in3.close(); }
1892 openFindex = m->openInputFile(findex, in4, "noerror"); in4.close();
1894 //if you can't open it, try default location
1895 if (openFindex == 1) {
1896 if (m->getDefaultPath() != "") { //default path is set
1897 string tryPath = m->getDefaultPath() + m->getSimpleName(findex);
1898 m->mothurOut("Unable to open " + findex + ". Trying default " + tryPath); m->mothurOutEndLine();
1900 openFindex = m->openInputFile(tryPath, in5, "noerror");
1906 //if you can't open it, try output location
1907 if (openFindex == 1) {
1908 if (m->getOutputDir() != "") { //default path is set
1909 string tryPath = m->getOutputDir() + m->getSimpleName(findex);
1910 m->mothurOut("Unable to open " + findex + ". Trying output directory " + tryPath); m->mothurOutEndLine();
1912 openFindex = m->openInputFile(tryPath, in6, "noerror");
1918 if (openFindex == 1) { //can't find it
1919 m->mothurOut("[WARNING]: can't find " + findex + ", ignoring pair.\n");
1926 openRindex = m->openInputFile(rindex, in7, "noerror"); in7.close();
1928 //if you can't open it, try default location
1929 if (openRindex == 1) {
1930 if (m->getDefaultPath() != "") { //default path is set
1931 string tryPath = m->getDefaultPath() + m->getSimpleName(rindex);
1932 m->mothurOut("Unable to open " + rindex + ". Trying default " + tryPath); m->mothurOutEndLine();
1934 openRindex = m->openInputFile(tryPath, in8, "noerror");
1940 //if you can't open it, try output location
1941 if (openRindex == 1) {
1942 if (m->getOutputDir() != "") { //default path is set
1943 string tryPath = m->getOutputDir() + m->getSimpleName(rindex);
1944 m->mothurOut("Unable to open " + rindex + ". Trying output directory " + tryPath); m->mothurOutEndLine();
1946 openRindex = m->openInputFile(tryPath, in9, "noerror");
1952 if (openRindex == 1) { //can't find it
1953 m->mothurOut("[WARNING]: can't find " + rindex + ", ignoring pair.\n");
1958 if ((openForward != 1) && (openReverse != 1) && (openFindex != 1) && (openRindex != 1)) { //good pair
1959 file2Group[files.size()] = group;
1960 vector<string> pair;
1961 pair.push_back(forward);
1962 pair.push_back(reverse);
1963 pair.push_back(findex);
1964 pair.push_back(rindex);
1965 if (((findex != "") || (rindex != "")) && (oligosfile == "")) { m->mothurOut("[ERROR]: You need to provide an oligos file if you are going to use an index file.\n"); m->control_pressed = true; }
1966 files.push_back(pair);
1973 catch(exception& e) {
1974 m->errorOut(e, "MakeContigsCommand", "readFileNames");
1978 //***************************************************************************************************************
1979 //illumina data requires paired forward and reverse data
1980 //BARCODE atgcatgc atgcatgc groupName
1981 //PRIMER atgcatgc atgcatgc groupName
1982 //PRIMER atgcatgc atgcatgc
1983 bool MakeContigsCommand::getOligos(vector<vector<string> >& fastaFileNames, string rootname, map<string, string>& fastaFile2Group){
1985 if (m->debug) { m->mothurOut("[DEBUG]: oligosfile = " + oligosfile + "\n"); }
1987 bool allBlank = false;
1988 oligos->read(oligosfile, false);
1990 if (m->control_pressed) { return false; } //error in reading oligos
1992 if (oligos->hasPairedBarcodes()) {
1993 numFPrimers = oligos->getPairedPrimers().size();
1994 numBarcodes = oligos->getPairedBarcodes().size();
1996 m->mothurOut("[ERROR]: make.contigs requires paired barcodes and primers. You can set one end to NONE if you are using an index file.\n"); m->control_pressed = true;
1999 if (m->control_pressed) { return false; }
2001 numLinkers = oligos->getLinkers().size();
2002 numSpacers = oligos->getSpacers().size();
2003 numRPrimers = oligos->getReversePrimers().size();
2004 if (numLinkers != 0) { m->mothurOut("[WARNING]: make.contigs is not setup to remove linkers, ignoring.\n"); }
2005 if (numSpacers != 0) { m->mothurOut("[WARNING]: make.contigs is not setup to remove spacers, ignoring.\n"); }
2007 vector<string> groupNames = oligos->getGroupNames();
2008 if (groupNames.size() == 0) { allFiles = 0; allBlank = true; }
2011 fastaFileNames.resize(oligos->getBarcodeNames().size());
2012 for(int i=0;i<fastaFileNames.size();i++){
2013 for(int j=0;j<oligos->getPrimerNames().size();j++){ fastaFileNames[i].push_back(""); }
2017 set<string> uniqueNames; //used to cleanup outputFileNames
2018 map<int, oligosPair> barcodes = oligos->getPairedBarcodes();
2019 map<int, oligosPair> primers = oligos->getPairedPrimers();
2020 for(map<int, oligosPair>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
2021 for(map<int, oligosPair>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
2023 string primerName = oligos->getPrimerName(itPrimer->first);
2024 string barcodeName = oligos->getBarcodeName(itBar->first);
2026 if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing
2027 else if ((primerName == "") && (barcodeName == "")) { } //do nothing
2029 string comboGroupName = "";
2030 string fastaFileName = "";
2031 string qualFileName = "";
2032 string nameFileName = "";
2033 string countFileName = "";
2035 if(primerName == ""){
2036 comboGroupName = barcodeName;
2038 if(barcodeName == ""){
2039 comboGroupName = primerName;
2042 comboGroupName = barcodeName + "." + primerName;
2048 map<string, string> variables;
2049 variables["[filename]"] = rootname;
2050 variables["[tag]"] = comboGroupName;
2051 fastaFileName = getOutputFileName("fasta", variables);
2052 if (uniqueNames.count(fastaFileName) == 0) {
2053 outputNames.push_back(fastaFileName);
2054 outputTypes["fasta"].push_back(fastaFileName);
2055 uniqueNames.insert(fastaFileName);
2056 fastaFile2Group[fastaFileName] = comboGroupName;
2059 fastaFileNames[itBar->first][itPrimer->first] = fastaFileName;
2060 m->openOutputFile(fastaFileName, temp); temp.close();
2061 cout << fastaFileName << endl;
2068 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
2076 catch(exception& e) {
2077 m->errorOut(e, "MakeContigsCommand", "getOligos");
2081 //**********************************************************************************************************************
2082 vector<int> MakeContigsCommand::convertQual(string qual) {
2084 vector<int> qualScores;
2085 bool negativeScores = false;
2087 for (int i = 0; i < qual.length(); i++) {
2090 temp = int(qual[i]);
2091 if (format == "illumina") {
2092 temp -= 64; //char '@'
2093 }else if (format == "illumina1.8+") {
2094 temp -= int('!'); //char '!'
2095 }else if (format == "solexa") {
2096 temp = int(convertTable[temp]); //convert to sanger
2097 temp -= int('!'); //char '!'
2099 temp -= int('!'); //char '!'
2102 if (temp < -5) { negativeScores = true; }
2103 qualScores.push_back(temp);
2106 if (negativeScores) { m->mothurOut("[ERROR]: finding negative quality scores, do you have the right format selected? http://en.wikipedia.org/wiki/FASTQ_format#Encoding \n"); m->control_pressed = true; }
2110 catch(exception& e) {
2111 m->errorOut(e, "MakeContigsCommand", "convertQual");
2116 //**********************************************************************************************************************