5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12 #include "trimoligos.h"
15 //**********************************************************************************************************************
16 vector<string> TrimSeqsCommand::setParameters(){
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
19 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos);
20 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none","qfile",false,false,true); parameters.push_back(pqfile);
21 CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","name",false,false,true); parameters.push_back(pname);
22 CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","count",false,false,true); parameters.push_back(pcount);
23 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(pflip);
24 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxambig);
25 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pmaxhomop);
26 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pminlength);
27 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pmaxlength);
28 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
29 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pbdiffs);
30 CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pldiffs);
31 CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(psdiffs);
32 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs);
33 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
34 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pallfiles);
35 CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepforward);
36 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pqtrim);
37 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqthreshold);
38 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqaverage);
39 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(prollaverage);
40 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqwindowaverage);
41 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "","",false,false); parameters.push_back(pqstepsize);
42 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "","",false,false); parameters.push_back(pqwindowsize);
43 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pkeepfirst);
44 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "","",false,false); parameters.push_back(premovelast);
45 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
46 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
48 vector<string> myArray;
49 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
53 m->errorOut(e, "TrimSeqsCommand", "setParameters");
57 //**********************************************************************************************************************
58 string TrimSeqsCommand::getHelpString(){
60 string helpString = "";
61 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
62 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
63 helpString += "The trim.seqs command parameters are fasta, name, count, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
64 helpString += "The fasta parameter is required.\n";
65 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
66 helpString += "The oligos parameter allows you to provide an oligos file.\n";
67 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
68 helpString += "The count parameter allows you to provide a count file with your fasta file.\n";
69 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
70 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
71 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
72 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
73 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
74 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
75 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
76 helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
77 helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
78 helpString += "The qfile parameter allows you to provide a quality file.\n";
79 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
80 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
81 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
82 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
83 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
84 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
85 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
86 helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n";
87 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
88 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
89 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
90 helpString += "The trim.seqs command should be in the following format: \n";
91 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
92 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
93 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
94 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
95 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
99 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
103 //**********************************************************************************************************************
104 string TrimSeqsCommand::getOutputPattern(string type) {
108 if (type == "qfile") { pattern = "[filename],[tag],qual"; }
109 else if (type == "fasta") { pattern = "[filename],[tag],fasta"; }
110 else if (type == "group") { pattern = "[filename],groups"; }
111 else if (type == "name") { pattern = "[filename],[tag],names"; }
112 else if (type == "count") { pattern = "[filename],[tag],count_table-[filename],count_table"; }
113 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
117 catch(exception& e) {
118 m->errorOut(e, "TrimSeqsCommand", "getOutputPattern");
122 //**********************************************************************************************************************
124 TrimSeqsCommand::TrimSeqsCommand(){
126 abort = true; calledHelp = true;
128 vector<string> tempOutNames;
129 outputTypes["fasta"] = tempOutNames;
130 outputTypes["qfile"] = tempOutNames;
131 outputTypes["group"] = tempOutNames;
132 outputTypes["name"] = tempOutNames;
133 outputTypes["count"] = tempOutNames;
135 catch(exception& e) {
136 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
140 //***************************************************************************************************************
142 TrimSeqsCommand::TrimSeqsCommand(string option) {
145 abort = false; calledHelp = false;
148 //allow user to run help
149 if(option == "help") { help(); abort = true; calledHelp = true; }
150 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
153 vector<string> myArray = setParameters();
155 OptionParser parser(option);
156 map<string,string> parameters = parser.getParameters();
158 ValidParameters validParameter;
159 map<string,string>::iterator it;
161 //check to make sure all parameters are valid for command
162 for (it = parameters.begin(); it != parameters.end(); it++) {
163 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
166 //initialize outputTypes
167 vector<string> tempOutNames;
168 outputTypes["fasta"] = tempOutNames;
169 outputTypes["qfile"] = tempOutNames;
170 outputTypes["group"] = tempOutNames;
171 outputTypes["name"] = tempOutNames;
172 outputTypes["count"] = tempOutNames;
174 //if the user changes the input directory command factory will send this info to us in the output parameter
175 string inputDir = validParameter.validFile(parameters, "inputdir", false);
176 if (inputDir == "not found"){ inputDir = ""; }
179 it = parameters.find("fasta");
180 //user has given a template file
181 if(it != parameters.end()){
182 path = m->hasPath(it->second);
183 //if the user has not given a path then, add inputdir. else leave path alone.
184 if (path == "") { parameters["fasta"] = inputDir + it->second; }
187 it = parameters.find("oligos");
188 //user has given a template file
189 if(it != parameters.end()){
190 path = m->hasPath(it->second);
191 //if the user has not given a path then, add inputdir. else leave path alone.
192 if (path == "") { parameters["oligos"] = inputDir + it->second; }
195 it = parameters.find("qfile");
196 //user has given a template file
197 if(it != parameters.end()){
198 path = m->hasPath(it->second);
199 //if the user has not given a path then, add inputdir. else leave path alone.
200 if (path == "") { parameters["qfile"] = inputDir + it->second; }
203 it = parameters.find("name");
204 //user has given a template file
205 if(it != parameters.end()){
206 path = m->hasPath(it->second);
207 //if the user has not given a path then, add inputdir. else leave path alone.
208 if (path == "") { parameters["name"] = inputDir + it->second; }
211 it = parameters.find("count");
212 //user has given a template file
213 if(it != parameters.end()){
214 path = m->hasPath(it->second);
215 //if the user has not given a path then, add inputdir. else leave path alone.
216 if (path == "") { parameters["count"] = inputDir + it->second; }
222 //check for required parameters
223 fastaFile = validParameter.validFile(parameters, "fasta", true);
224 if (fastaFile == "not found") {
225 fastaFile = m->getFastaFile();
226 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
227 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
228 }else if (fastaFile == "not open") { abort = true; }
229 else { m->setFastaFile(fastaFile); }
231 //if the user changes the output directory command factory will send this info to us in the output parameter
232 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
234 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
238 //check for optional parameter and set defaults
239 // ...at some point should added some additional type checking...
241 temp = validParameter.validFile(parameters, "flip", false);
242 if (temp == "not found") { flip = 0; }
243 else { flip = m->isTrue(temp); }
245 temp = validParameter.validFile(parameters, "oligos", true);
246 if (temp == "not found"){ oligoFile = ""; }
247 else if(temp == "not open"){ abort = true; }
248 else { oligoFile = temp; m->setOligosFile(oligoFile); }
251 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
252 m->mothurConvert(temp, maxAmbig);
254 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
255 m->mothurConvert(temp, maxHomoP);
257 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
258 m->mothurConvert(temp, minLength);
260 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
261 m->mothurConvert(temp, maxLength);
263 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
264 m->mothurConvert(temp, bdiffs);
266 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
267 m->mothurConvert(temp, pdiffs);
269 temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
270 m->mothurConvert(temp, ldiffs);
272 temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
273 m->mothurConvert(temp, sdiffs);
275 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
276 m->mothurConvert(temp, tdiffs);
278 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
280 temp = validParameter.validFile(parameters, "qfile", true);
281 if (temp == "not found") { qFileName = ""; }
282 else if(temp == "not open") { abort = true; }
283 else { qFileName = temp; m->setQualFile(qFileName); }
285 temp = validParameter.validFile(parameters, "name", true);
286 if (temp == "not found") { nameFile = ""; }
287 else if(temp == "not open") { nameFile = ""; abort = true; }
288 else { nameFile = temp; m->setNameFile(nameFile); }
290 countfile = validParameter.validFile(parameters, "count", true);
291 if (countfile == "not open") { abort = true; countfile = ""; }
292 else if (countfile == "not found") { countfile = ""; }
293 else { m->setCountTableFile(countfile); }
295 if ((countfile != "") && (nameFile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
297 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
298 m->mothurConvert(temp, qThreshold);
300 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
301 qtrim = m->isTrue(temp);
303 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
304 convert(temp, qRollAverage);
306 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
307 convert(temp, qWindowAverage);
309 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
310 convert(temp, qWindowSize);
312 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
313 convert(temp, qWindowStep);
315 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
316 convert(temp, qAverage);
318 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
319 convert(temp, keepFirst);
321 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
322 convert(temp, removeLast);
324 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
325 allFiles = m->isTrue(temp);
327 temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; }
328 keepforward = m->isTrue(temp);
330 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
331 m->setProcessors(temp);
332 m->mothurConvert(temp, processors);
335 if(allFiles && (oligoFile == "")){
336 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
338 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
339 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
343 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
344 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
348 if (countfile == "") {
349 if (nameFile == "") {
350 vector<string> files; files.push_back(fastaFile);
351 parser.getNameFile(files);
357 catch(exception& e) {
358 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
362 //***************************************************************************************************************
364 int TrimSeqsCommand::execute(){
367 if (abort == true) { if (calledHelp) { return 0; } return 2; }
369 numFPrimers = 0; //this needs to be initialized
374 vector<vector<string> > fastaFileNames;
375 vector<vector<string> > qualFileNames;
376 vector<vector<string> > nameFileNames;
378 map<string, string> variables;
379 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile));
380 variables["[tag]"] = "trim";
381 string trimSeqFile = getOutputFileName("fasta",variables);
382 string trimQualFile = getOutputFileName("qfile",variables);
383 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
385 variables["[tag]"] = "scrap";
386 string scrapSeqFile = getOutputFileName("fasta",variables);
387 string scrapQualFile = getOutputFileName("qfile",variables);
388 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
390 if (qFileName != "") {
391 outputNames.push_back(trimQualFile);
392 outputNames.push_back(scrapQualFile);
393 outputTypes["qfile"].push_back(trimQualFile);
394 outputTypes["qfile"].push_back(scrapQualFile);
397 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile));
398 variables["[tag]"] = "trim";
399 string trimNameFile = getOutputFileName("name",variables);
400 variables["[tag]"] = "scrap";
401 string scrapNameFile = getOutputFileName("name",variables);
403 if (nameFile != "") {
404 m->readNames(nameFile, nameMap);
405 outputNames.push_back(trimNameFile);
406 outputNames.push_back(scrapNameFile);
407 outputTypes["name"].push_back(trimNameFile);
408 outputTypes["name"].push_back(scrapNameFile);
411 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
412 variables["[tag]"] = "trim";
413 string trimCountFile = getOutputFileName("count",variables);
414 variables["[tag]"] = "scrap";
415 string scrapCountFile = getOutputFileName("count",variables);
417 if (countfile != "") {
419 ct.readTable(countfile);
420 nameCount = ct.getNameMap();
421 outputNames.push_back(trimCountFile);
422 outputNames.push_back(scrapCountFile);
423 outputTypes["count"].push_back(trimCountFile);
424 outputTypes["count"].push_back(scrapCountFile);
428 if (m->control_pressed) { return 0; }
430 string outputGroupFileName;
432 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
433 if ((createGroup) && (countfile == "")){
434 map<string, string> myvariables;
435 myvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile));
436 outputGroupFileName = getOutputFileName("group",myvariables);
437 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
441 //fills lines and qlines
442 setLines(fastaFile, qFileName);
445 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
447 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
451 if (m->control_pressed) { return 0; }
454 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
455 map<string, string>::iterator it;
456 set<string> namesToRemove;
457 for(int i=0;i<fastaFileNames.size();i++){
458 for(int j=0;j<fastaFileNames[0].size();j++){
459 if (fastaFileNames[i][j] != "") {
460 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
461 if(m->isBlank(fastaFileNames[i][j])){
462 m->mothurRemove(fastaFileNames[i][j]);
463 namesToRemove.insert(fastaFileNames[i][j]);
466 m->mothurRemove(qualFileNames[i][j]);
467 namesToRemove.insert(qualFileNames[i][j]);
471 m->mothurRemove(nameFileNames[i][j]);
472 namesToRemove.insert(nameFileNames[i][j]);
475 it = uniqueFastaNames.find(fastaFileNames[i][j]);
476 if (it == uniqueFastaNames.end()) {
477 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
485 //remove names for outputFileNames, just cleans up the output
486 vector<string> outputNames2;
487 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
488 outputNames = outputNames2;
490 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
492 m->openInputFile(it->first, in);
495 map<string, string> myvariables;
496 myvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(it->first));
497 string thisGroupName = "";
498 if (countfile == "") { thisGroupName = getOutputFileName("group",myvariables); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); }
499 else { thisGroupName = getOutputFileName("count",myvariables); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); }
500 m->openOutputFile(thisGroupName, out);
502 if (countfile != "") { out << "Representative_Sequence\ttotal\t" << it->second << endl; }
505 if (m->control_pressed) { break; }
507 Sequence currSeq(in); m->gobble(in);
508 if (countfile == "") {
509 out << currSeq.getName() << '\t' << it->second << endl;
511 if (nameFile != "") {
512 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
513 if (itName != nameMap.end()) {
514 vector<string> thisSeqsNames;
515 m->splitAtChar(itName->second, thisSeqsNames, ',');
516 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
517 out << thisSeqsNames[k] << '\t' << it->second << endl;
519 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
522 map<string, int>::iterator itTotalReps = nameCount.find(currSeq.getName());
523 if (itTotalReps != nameCount.end()) { out << currSeq.getName() << '\t' << itTotalReps->second << '\t' << itTotalReps->second << endl; }
524 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
531 if (countfile != "") { //create countfile with group info included
532 CountTable* ct = new CountTable();
533 ct->readTable(trimCountFile);
534 map<string, int> justTrimmedNames = ct->getNameMap();
538 for (map<string, int>::iterator itCount = groupCounts.begin(); itCount != groupCounts.end(); itCount++) { newCt.addGroup(itCount->first); }
539 vector<int> tempCounts; tempCounts.resize(groupCounts.size(), 0);
540 for (map<string, int>::iterator itNames = justTrimmedNames.begin(); itNames != justTrimmedNames.end(); itNames++) {
541 newCt.push_back(itNames->first, tempCounts); //add it to the table with no abundance so we can set the groups abundance
542 map<string, string>::iterator it2 = groupMap.find(itNames->first);
543 if (it2 != groupMap.end()) { newCt.setAbund(itNames->first, it2->second, itNames->second); }
544 else { m->mothurOut("[ERROR]: missing group info for " + itNames->first + "."); m->mothurOutEndLine(); m->control_pressed = true; }
546 newCt.printTable(trimCountFile);
550 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
552 //output group counts
553 m->mothurOutEndLine();
555 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
556 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
557 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
559 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
561 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
563 //set fasta file as new current fastafile
565 itTypes = outputTypes.find("fasta");
566 if (itTypes != outputTypes.end()) {
567 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
570 itTypes = outputTypes.find("name");
571 if (itTypes != outputTypes.end()) {
572 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
575 itTypes = outputTypes.find("qfile");
576 if (itTypes != outputTypes.end()) {
577 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
580 itTypes = outputTypes.find("group");
581 if (itTypes != outputTypes.end()) {
582 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
585 itTypes = outputTypes.find("count");
586 if (itTypes != outputTypes.end()) {
587 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
590 m->mothurOutEndLine();
591 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
592 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
593 m->mothurOutEndLine();
598 catch(exception& e) {
599 m->errorOut(e, "TrimSeqsCommand", "execute");
604 /**************************************************************************************/
605 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string trimCFileName, string scrapCFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {
609 ofstream trimFASTAFile;
610 m->openOutputFile(trimFileName, trimFASTAFile);
612 ofstream scrapFASTAFile;
613 m->openOutputFile(scrapFileName, scrapFASTAFile);
615 ofstream trimQualFile;
616 ofstream scrapQualFile;
618 m->openOutputFile(trimQFileName, trimQualFile);
619 m->openOutputFile(scrapQFileName, scrapQualFile);
622 ofstream trimNameFile;
623 ofstream scrapNameFile;
625 m->openOutputFile(trimNFileName, trimNameFile);
626 m->openOutputFile(scrapNFileName, scrapNameFile);
629 ofstream trimCountFile;
630 ofstream scrapCountFile;
632 m->openOutputFile(trimCFileName, trimCountFile);
633 m->openOutputFile(scrapCFileName, scrapCountFile);
634 if (line.start == 0) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
637 ofstream outGroupsFile;
638 if ((createGroup) && (countfile == "")){ m->openOutputFile(groupFileName, outGroupsFile); }
640 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
641 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
642 if (fastaFileNames[i][j] != "") {
644 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
646 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
650 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
658 m->openInputFile(filename, inFASTA);
659 inFASTA.seekg(line.start);
662 if(qFileName != "") {
663 m->openInputFile(qFileName, qFile);
664 qFile.seekg(qline.start);
669 TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
673 if (m->control_pressed) {
674 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
675 if ((createGroup) && (countfile == "")) { outGroupsFile.close(); }
676 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
677 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
678 if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
679 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0;
683 string trashCode = "";
684 int currentSeqsDiffs = 0;
686 Sequence currSeq(inFASTA); m->gobble(inFASTA);
687 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
689 QualityScores currQual;
691 currQual = QualityScores(qFile); m->gobble(qFile);
692 //cout << currQual.getName() << endl;
695 string origSeq = currSeq.getUnaligned();
698 int barcodeIndex = 0;
702 success = trimOligos.stripLinker(currSeq, currQual);
703 if(success > ldiffs) { trashCode += 'k'; }
704 else{ currentSeqsDiffs += success; }
708 if(barcodes.size() != 0){
709 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
710 if(success > bdiffs) { trashCode += 'b'; }
711 else{ currentSeqsDiffs += success; }
715 success = trimOligos.stripSpacer(currSeq, currQual);
716 if(success > sdiffs) { trashCode += 's'; }
717 else{ currentSeqsDiffs += success; }
721 if(numFPrimers != 0){
722 success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward);
723 if(success > pdiffs) { trashCode += 'f'; }
724 else{ currentSeqsDiffs += success; }
727 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
729 if(numRPrimers != 0){
730 success = trimOligos.stripReverse(currSeq, currQual);
731 if(!success) { trashCode += 'r'; }
735 success = keepFirstTrim(currSeq, currQual);
739 success = removeLastTrim(currSeq, currQual);
740 if(!success) { trashCode += 'l'; }
745 int origLength = currSeq.getNumBases();
747 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
748 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
749 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
750 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
751 else { success = 1; }
753 //you don't want to trim, if it fails above then scrap it
754 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
756 if(!success) { trashCode += 'q'; }
759 if(minLength > 0 || maxLength > 0){
760 success = cullLength(currSeq);
761 if(!success) { trashCode += 'l'; }
764 success = cullHomoP(currSeq);
765 if(!success) { trashCode += 'h'; }
768 success = cullAmbigs(currSeq);
769 if(!success) { trashCode += 'n'; }
772 if(flip){ // should go last
773 currSeq.reverseComplement();
775 currQual.flipQScores();
779 if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + ", trashcode= " + trashCode); if (trashCode.length() != 0) { m->mothurOutEndLine(); } }
781 if(trashCode.length() == 0){
782 string thisGroup = "";
784 if(barcodes.size() != 0){
785 thisGroup = barcodeNameVector[barcodeIndex];
786 if (primers.size() != 0) {
787 if (primerNameVector[primerIndex] != "") {
788 if(thisGroup != "") {
789 thisGroup += "." + primerNameVector[primerIndex];
791 thisGroup = primerNameVector[primerIndex];
798 int pos = thisGroup.find("ignore");
799 if (pos == string::npos) {
800 currSeq.setAligned(currSeq.getUnaligned());
801 currSeq.printSequence(trimFASTAFile);
804 currQual.printQScores(trimQualFile);
809 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
810 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
811 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
814 int numRedundants = 0;
815 if (countfile != "") {
816 map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
817 if (itCount != nameCount.end()) {
818 trimCountFile << itCount->first << '\t' << itCount->second << endl;
819 numRedundants = itCount->second-1;
820 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
824 if(barcodes.size() != 0){
826 if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); }
828 if (countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
829 else { groupMap[currSeq.getName()] = thisGroup; }
831 if (nameFile != "") {
832 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
833 if (itName != nameMap.end()) {
834 vector<string> thisSeqsNames;
835 m->splitAtChar(itName->second, thisSeqsNames, ',');
836 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
837 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
838 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
840 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
843 map<string, int>::iterator it = groupCounts.find(thisGroup);
844 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; }
845 else { groupCounts[it->first] += (1 + numRedundants); }
852 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
853 currSeq.printSequence(output);
857 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
858 currQual.printQScores(output);
863 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
864 if (itName != nameMap.end()) {
865 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
866 output << itName->first << '\t' << itName->second << endl;
868 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
874 if(nameFile != ""){ //needs to be before the currSeq name is changed
875 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
876 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
877 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
879 if (countfile != "") {
880 map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
881 if (itCount != nameCount.end()) {
882 trimCountFile << itCount->first << '\t' << itCount->second << endl;
883 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
886 currSeq.setName(currSeq.getName() + '|' + trashCode);
887 currSeq.setUnaligned(origSeq);
888 currSeq.setAligned(origSeq);
889 currSeq.printSequence(scrapFASTAFile);
891 currQual.printQScores(scrapQualFile);
897 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
898 unsigned long long pos = inFASTA.tellg();
899 if ((pos == -1) || (pos >= line.end)) { break; }
902 if (inFASTA.eof()) { break; }
906 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
910 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
914 trimFASTAFile.close();
915 scrapFASTAFile.close();
916 if (createGroup) { outGroupsFile.close(); }
917 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
918 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
919 if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
923 catch(exception& e) {
924 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
929 /**************************************************************************************************/
931 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string trimCountFileName, string scrapCountFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
938 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
939 //loop through and create all the processes you want
940 while (process != processors) {
944 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
948 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
949 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
950 vector<vector<string> > tempNameFileNames = nameFileNames;
955 for(int i=0;i<tempFASTAFileNames.size();i++){
956 for(int j=0;j<tempFASTAFileNames[i].size();j++){
957 if (tempFASTAFileNames[i][j] != "") {
958 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
959 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
962 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
963 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
966 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
967 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
974 driverCreateTrim(filename,
976 (trimFASTAFileName + toString(getpid()) + ".temp"),
977 (scrapFASTAFileName + toString(getpid()) + ".temp"),
978 (trimQualFileName + toString(getpid()) + ".temp"),
979 (scrapQualFileName + toString(getpid()) + ".temp"),
980 (trimNameFileName + toString(getpid()) + ".temp"),
981 (scrapNameFileName + toString(getpid()) + ".temp"),
982 (trimCountFileName + toString(getpid()) + ".temp"),
983 (scrapCountFileName + toString(getpid()) + ".temp"),
984 (groupFile + toString(getpid()) + ".temp"),
986 tempPrimerQualFileNames,
991 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(lines[process].start) + '\t' + toString(qLines[process].start) + '\t' + toString(getpid()) + '\n'); }
993 //pass groupCounts to parent
996 string tempFile = filename + toString(getpid()) + ".num.temp";
997 m->openOutputFile(tempFile, out);
999 out << groupCounts.size() << endl;
1001 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
1002 out << it->first << '\t' << it->second << endl;
1005 out << groupMap.size() << endl;
1006 for (map<string, string>::iterator it = groupMap.begin(); it != groupMap.end(); it++) {
1007 out << it->first << '\t' << it->second << endl;
1013 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1014 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1021 m->openOutputFile(trimFASTAFileName, temp); temp.close();
1022 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
1023 if(qFileName != ""){
1024 m->openOutputFile(trimQualFileName, temp); temp.close();
1025 m->openOutputFile(scrapQualFileName, temp); temp.close();
1027 if (nameFile != "") {
1028 m->openOutputFile(trimNameFileName, temp); temp.close();
1029 m->openOutputFile(scrapNameFileName, temp); temp.close();
1031 if (countfile != "") {
1032 m->openOutputFile(trimCountFileName, temp); temp.close();
1033 m->openOutputFile(scrapCountFileName, temp); temp.close();
1036 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, trimCountFileName, scrapCountFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
1038 //force parent to wait until all the processes are done
1039 for (int i=0;i<processIDS.size();i++) {
1040 int temp = processIDS[i];
1044 //////////////////////////////////////////////////////////////////////////////////////////////////////
1045 //Windows version shared memory, so be careful when passing variables through the trimData struct.
1046 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1047 //////////////////////////////////////////////////////////////////////////////////////////////////////
1049 vector<trimData*> pDataArray;
1050 DWORD dwThreadIdArray[processors-1];
1051 HANDLE hThreadArray[processors-1];
1053 //Create processor worker threads.
1054 for( int h=0; h<processors-1; h++){
1056 string extension = "";
1057 if (h != 0) { extension = toString(h) + ".temp"; processIDS.push_back(h); }
1058 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
1059 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
1060 vector<vector<string> > tempNameFileNames = nameFileNames;
1065 for(int i=0;i<tempFASTAFileNames.size();i++){
1066 for(int j=0;j<tempFASTAFileNames[i].size();j++){
1067 if (tempFASTAFileNames[i][j] != "") {
1068 tempFASTAFileNames[i][j] += extension;
1069 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
1071 if(qFileName != ""){
1072 tempPrimerQualFileNames[i][j] += extension;
1073 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
1076 tempNameFileNames[i][j] += extension;
1077 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
1085 trimData* tempTrim = new trimData(filename,
1086 qFileName, nameFile, countfile,
1087 (trimFASTAFileName+extension),
1088 (scrapFASTAFileName+extension),
1089 (trimQualFileName+extension),
1090 (scrapQualFileName+extension),
1091 (trimNameFileName+extension),
1092 (scrapNameFileName+extension),
1093 (trimCountFileName+extension),
1094 (scrapCountFileName+extension),
1095 (groupFile+extension),
1097 tempPrimerQualFileNames,
1099 lines[h].start, lines[h].end, qLines[h].start, qLines[h].end, m,
1100 pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer,
1101 primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
1102 qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
1103 minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap, nameCount);
1104 pDataArray.push_back(tempTrim);
1106 hThreadArray[h] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]);
1111 m->openOutputFile(trimFASTAFileName, temp); temp.close();
1112 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
1113 if(qFileName != ""){
1114 m->openOutputFile(trimQualFileName, temp); temp.close();
1115 m->openOutputFile(scrapQualFileName, temp); temp.close();
1117 if (nameFile != "") {
1118 m->openOutputFile(trimNameFileName, temp); temp.close();
1119 m->openOutputFile(scrapNameFileName, temp); temp.close();
1121 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
1122 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
1123 vector<vector<string> > tempNameFileNames = nameFileNames;
1126 string extension = toString(processors-1) + ".temp";
1127 for(int i=0;i<tempFASTAFileNames.size();i++){
1128 for(int j=0;j<tempFASTAFileNames[i].size();j++){
1129 if (tempFASTAFileNames[i][j] != "") {
1130 tempFASTAFileNames[i][j] += extension;
1131 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
1133 if(qFileName != ""){
1134 tempPrimerQualFileNames[i][j] += extension;
1135 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
1138 tempNameFileNames[i][j] += extension;
1139 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
1146 driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames, tempNameFileNames, lines[processors-1], qLines[processors-1]);
1147 processIDS.push_back(processors-1);
1150 //Wait until all threads have terminated.
1151 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1153 //Close all thread handles and free memory allocations.
1154 for(int i=0; i < pDataArray.size(); i++){
1155 for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
1156 map<string, int>::iterator it2 = groupCounts.find(it->first);
1157 if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
1158 else { groupCounts[it->first] += it->second; }
1160 for (map<string, string>::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) {
1161 map<string, string>::iterator it2 = groupMap.find(it->first);
1162 if (it2 == groupMap.end()) { groupMap[it->first] = it->second; }
1163 else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
1165 CloseHandle(hThreadArray[i]);
1166 delete pDataArray[i];
1173 for(int i=0;i<processIDS.size();i++){
1175 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1177 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
1178 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
1179 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
1180 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
1182 if(qFileName != ""){
1183 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
1184 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
1185 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
1186 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
1190 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
1191 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
1192 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
1193 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
1196 if(countfile != ""){
1197 m->appendFiles((trimCountFileName + toString(processIDS[i]) + ".temp"), trimCountFileName);
1198 m->mothurRemove((trimCountFileName + toString(processIDS[i]) + ".temp"));
1199 m->appendFiles((scrapCountFileName + toString(processIDS[i]) + ".temp"), scrapCountFileName);
1200 m->mothurRemove((scrapCountFileName + toString(processIDS[i]) + ".temp"));
1203 if((createGroup)&&(countfile == "")){
1204 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
1205 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
1210 for(int j=0;j<fastaFileNames.size();j++){
1211 for(int k=0;k<fastaFileNames[j].size();k++){
1212 if (fastaFileNames[j][k] != "") {
1213 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
1214 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1216 if(qFileName != ""){
1217 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
1218 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1222 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
1223 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1230 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1233 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1234 m->openInputFile(tempFile, in);
1238 in >> tempNum; m->gobble(in);
1241 for (int i = 0; i < tempNum; i++) {
1243 in >> group >> groupNum; m->gobble(in);
1245 map<string, int>::iterator it = groupCounts.find(group);
1246 if (it == groupCounts.end()) { groupCounts[group] = groupNum; }
1247 else { groupCounts[it->first] += groupNum; }
1250 in >> tempNum; m->gobble(in);
1252 for (int i = 0; i < tempNum; i++) {
1253 string group, seqName;
1254 in >> seqName >> group; m->gobble(in);
1256 map<string, string>::iterator it = groupMap.find(seqName);
1257 if (it == groupMap.end()) { groupMap[seqName] = group; }
1258 else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
1262 in.close(); m->mothurRemove(tempFile);
1269 catch(exception& e) {
1270 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
1275 /**************************************************************************************************/
1277 int TrimSeqsCommand::setLines(string filename, string qfilename) {
1280 vector<unsigned long long> fastaFilePos;
1281 vector<unsigned long long> qfileFilePos;
1283 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1284 //set file positions for fasta file
1285 fastaFilePos = m->divideFile(filename, processors);
1287 //get name of first sequence in each chunk
1288 map<string, int> firstSeqNames;
1289 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1291 m->openInputFile(filename, in);
1292 in.seekg(fastaFilePos[i]);
1295 firstSeqNames[temp.getName()] = i;
1300 if(qfilename != "") {
1301 //seach for filePos of each first name in the qfile and save in qfileFilePos
1303 m->openInputFile(qfilename, inQual);
1306 while(!inQual.eof()){
1307 input = m->getline(inQual);
1309 if (input.length() != 0) {
1310 if(input[0] == '>'){ //this is a sequence name line
1311 istringstream nameStream(input);
1313 string sname = ""; nameStream >> sname;
1314 sname = sname.substr(1);
1316 map<string, int>::iterator it = firstSeqNames.find(sname);
1318 if(it != firstSeqNames.end()) { //this is the start of a new chunk
1319 unsigned long long pos = inQual.tellg();
1320 qfileFilePos.push_back(pos - input.length() - 1);
1321 firstSeqNames.erase(it);
1326 if (firstSeqNames.size() == 0) { break; }
1331 if (firstSeqNames.size() != 0) {
1332 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
1333 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1339 //get last file position of qfile
1341 unsigned long long size;
1343 //get num bytes in file
1344 pFile = fopen (qfilename.c_str(),"rb");
1345 if (pFile==NULL) perror ("Error opening file");
1347 fseek (pFile, 0, SEEK_END);
1352 qfileFilePos.push_back(size);
1355 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1356 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) +'\t' + toString(fastaFilePos[i]) + '\t' + toString(fastaFilePos[i+1]) + '\n'); }
1357 lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
1358 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); }
1360 if(qfilename == "") { qLines = lines; } //files with duds
1366 if (processors == 1) { //save time
1367 //fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1368 //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
1369 lines.push_back(linePair(0, 1000));
1370 if (qfilename != "") { qLines.push_back(linePair(0, 1000)); }
1372 int numFastaSeqs = 0;
1373 fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs);
1374 if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); }
1376 if (qfilename != "") {
1377 int numQualSeqs = 0;
1378 qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs);
1380 if (numFastaSeqs != numQualSeqs) {
1381 m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true;
1385 //figure out how many sequences you have to process
1386 int numSeqsPerProcessor = numFastaSeqs / processors;
1387 for (int i = 0; i < processors; i++) {
1388 int startIndex = i * numSeqsPerProcessor;
1389 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
1390 lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
1391 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
1394 if(qfilename == "") { qLines = lines; } //files with duds
1399 catch(exception& e) {
1400 m->errorOut(e, "TrimSeqsCommand", "setLines");
1405 //***************************************************************************************************************
1407 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1410 m->openInputFile(oligoFile, inOligos);
1414 string type, oligo, group;
1416 int indexPrimer = 0;
1417 int indexBarcode = 0;
1419 while(!inOligos.eof()){
1423 if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }
1426 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1427 m->gobble(inOligos);
1430 m->gobble(inOligos);
1431 //make type case insensitive
1432 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1436 if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); }
1438 for(int i=0;i<oligo.length();i++){
1439 oligo[i] = toupper(oligo[i]);
1440 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1443 if(type == "FORWARD"){
1446 // get rest of line in case there is a primer name
1447 while (!inOligos.eof()) {
1448 char c = inOligos.get();
1449 if (c == 10 || c == 13){ break; }
1450 else if (c == 32 || c == 9){;} //space or tab
1451 else { group += c; }
1454 //check for repeat barcodes
1455 map<string, int>::iterator itPrime = primers.find(oligo);
1456 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1458 if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); } }
1460 primers[oligo]=indexPrimer; indexPrimer++;
1461 primerNameVector.push_back(group);
1463 else if(type == "REVERSE"){
1464 //Sequence oligoRC("reverse", oligo);
1465 //oligoRC.reverseComplement();
1466 string oligoRC = reverseOligo(oligo);
1467 revPrimer.push_back(oligoRC);
1469 else if(type == "BARCODE"){
1472 //check for repeat barcodes
1473 map<string, int>::iterator itBar = barcodes.find(oligo);
1474 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1476 barcodes[oligo]=indexBarcode; indexBarcode++;
1477 barcodeNameVector.push_back(group);
1478 }else if(type == "LINKER"){
1479 linker.push_back(oligo);
1480 }else if(type == "SPACER"){
1481 spacer.push_back(oligo);
1483 else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1485 m->gobble(inOligos);
1489 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1491 //add in potential combos
1492 if(barcodeNameVector.size() == 0){
1494 barcodeNameVector.push_back("");
1497 if(primerNameVector.size() == 0){
1499 primerNameVector.push_back("");
1502 fastaFileNames.resize(barcodeNameVector.size());
1503 for(int i=0;i<fastaFileNames.size();i++){
1504 fastaFileNames[i].assign(primerNameVector.size(), "");
1506 if(qFileName != "") { qualFileNames = fastaFileNames; }
1507 if(nameFile != "") { nameFileNames = fastaFileNames; }
1510 set<string> uniqueNames; //used to cleanup outputFileNames
1511 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1512 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1514 string primerName = primerNameVector[itPrimer->second];
1515 string barcodeName = barcodeNameVector[itBar->second];
1517 if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing
1519 string comboGroupName = "";
1520 string fastaFileName = "";
1521 string qualFileName = "";
1522 string nameFileName = "";
1523 string countFileName = "";
1525 if(primerName == ""){
1526 comboGroupName = barcodeNameVector[itBar->second];
1529 if(barcodeName == ""){
1530 comboGroupName = primerNameVector[itPrimer->second];
1533 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1539 map<string, string> variables;
1540 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile));
1541 variables["[tag]"] = comboGroupName;
1542 fastaFileName = getOutputFileName("fasta", variables);
1543 if (uniqueNames.count(fastaFileName) == 0) {
1544 outputNames.push_back(fastaFileName);
1545 outputTypes["fasta"].push_back(fastaFileName);
1546 uniqueNames.insert(fastaFileName);
1549 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1550 m->openOutputFile(fastaFileName, temp); temp.close();
1552 if(qFileName != ""){
1553 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qFileName));
1554 qualFileName = getOutputFileName("qfile", variables);
1555 if (uniqueNames.count(qualFileName) == 0) {
1556 outputNames.push_back(qualFileName);
1557 outputTypes["qfile"].push_back(qualFileName);
1560 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1561 m->openOutputFile(qualFileName, temp); temp.close();
1565 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile));
1566 nameFileName = getOutputFileName("name", variables);
1567 if (uniqueNames.count(nameFileName) == 0) {
1568 outputNames.push_back(nameFileName);
1569 outputTypes["name"].push_back(nameFileName);
1572 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1573 m->openOutputFile(nameFileName, temp); temp.close();
1579 numFPrimers = primers.size();
1580 numRPrimers = revPrimer.size();
1581 numLinkers = linker.size();
1582 numSpacers = spacer.size();
1584 bool allBlank = true;
1585 for (int i = 0; i < barcodeNameVector.size(); i++) {
1586 if (barcodeNameVector[i] != "") {
1591 for (int i = 0; i < primerNameVector.size(); i++) {
1592 if (primerNameVector[i] != "") {
1599 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1607 catch(exception& e) {
1608 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1612 //***************************************************************************************************************
1614 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1617 if(qscores.getName() != ""){
1618 qscores.trimQScores(-1, keepFirst);
1620 sequence.trim(keepFirst);
1623 catch(exception& e) {
1624 m->errorOut(e, "keepFirstTrim", "countDiffs");
1630 //***************************************************************************************************************
1632 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1636 int length = sequence.getNumBases() - removeLast;
1639 if(qscores.getName() != ""){
1640 qscores.trimQScores(-1, length);
1642 sequence.trim(length);
1651 catch(exception& e) {
1652 m->errorOut(e, "removeLastTrim", "countDiffs");
1658 //***************************************************************************************************************
1660 bool TrimSeqsCommand::cullLength(Sequence& seq){
1663 int length = seq.getNumBases();
1664 bool success = 0; //guilty until proven innocent
1666 if(length >= minLength && maxLength == 0) { success = 1; }
1667 else if(length >= minLength && length <= maxLength) { success = 1; }
1668 else { success = 0; }
1673 catch(exception& e) {
1674 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1680 //***************************************************************************************************************
1682 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1684 int longHomoP = seq.getLongHomoPolymer();
1685 bool success = 0; //guilty until proven innocent
1687 if(longHomoP <= maxHomoP){ success = 1; }
1688 else { success = 0; }
1692 catch(exception& e) {
1693 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1698 //********************************************************************/
1699 string TrimSeqsCommand::reverseOligo(string oligo){
1701 string reverse = "";
1703 for(int i=oligo.length()-1;i>=0;i--){
1705 if(oligo[i] == 'A') { reverse += 'T'; }
1706 else if(oligo[i] == 'T'){ reverse += 'A'; }
1707 else if(oligo[i] == 'U'){ reverse += 'A'; }
1709 else if(oligo[i] == 'G'){ reverse += 'C'; }
1710 else if(oligo[i] == 'C'){ reverse += 'G'; }
1712 else if(oligo[i] == 'R'){ reverse += 'Y'; }
1713 else if(oligo[i] == 'Y'){ reverse += 'R'; }
1715 else if(oligo[i] == 'M'){ reverse += 'K'; }
1716 else if(oligo[i] == 'K'){ reverse += 'M'; }
1718 else if(oligo[i] == 'W'){ reverse += 'W'; }
1719 else if(oligo[i] == 'S'){ reverse += 'S'; }
1721 else if(oligo[i] == 'B'){ reverse += 'V'; }
1722 else if(oligo[i] == 'V'){ reverse += 'B'; }
1724 else if(oligo[i] == 'D'){ reverse += 'H'; }
1725 else if(oligo[i] == 'H'){ reverse += 'D'; }
1727 else { reverse += 'N'; }
1733 catch(exception& e) {
1734 m->errorOut(e, "TrimSeqsCommand", "reverseOligo");
1739 //***************************************************************************************************************
1741 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1743 int numNs = seq.getAmbigBases();
1744 bool success = 0; //guilty until proven innocent
1746 if(numNs <= maxAmbig) { success = 1; }
1747 else { success = 0; }
1751 catch(exception& e) {
1752 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1757 //***************************************************************************************************************