5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12 #include "trimoligos.h"
15 //**********************************************************************************************************************
16 vector<string> TrimSeqsCommand::setParameters(){
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
19 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos);
20 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none","qfile",false,false,true); parameters.push_back(pqfile);
21 CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","name",false,false,true); parameters.push_back(pname);
22 CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","count",false,false,true); parameters.push_back(pcount);
23 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(pflip);
24 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxambig);
25 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pmaxhomop);
26 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pminlength);
27 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pmaxlength);
28 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
29 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pbdiffs);
30 CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pldiffs);
31 CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(psdiffs);
32 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs);
33 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
34 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pallfiles);
35 CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepforward);
36 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pqtrim);
37 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqthreshold);
38 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqaverage);
39 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(prollaverage);
40 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqwindowaverage);
41 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "","",false,false); parameters.push_back(pqstepsize);
42 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "","",false,false); parameters.push_back(pqwindowsize);
43 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pkeepfirst);
44 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "","",false,false); parameters.push_back(premovelast);
45 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
46 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
48 vector<string> myArray;
49 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
53 m->errorOut(e, "TrimSeqsCommand", "setParameters");
57 //**********************************************************************************************************************
58 string TrimSeqsCommand::getHelpString(){
60 string helpString = "";
61 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
62 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
63 helpString += "The trim.seqs command parameters are fasta, name, count, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
64 helpString += "The fasta parameter is required.\n";
65 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
66 helpString += "The oligos parameter allows you to provide an oligos file.\n";
67 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
68 helpString += "The count parameter allows you to provide a count file with your fasta file.\n";
69 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
70 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
71 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
72 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
73 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
74 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
75 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
76 helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
77 helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
78 helpString += "The qfile parameter allows you to provide a quality file.\n";
79 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
80 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
81 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
82 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
83 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
84 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
85 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
86 helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n";
87 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
88 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
89 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
90 helpString += "The trim.seqs command should be in the following format: \n";
91 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
92 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
93 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
94 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
95 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
99 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
103 //**********************************************************************************************************************
104 string TrimSeqsCommand::getOutputPattern(string type) {
108 if (type == "qfile") { pattern = "[filename],[tag],qual"; }
109 else if (type == "fasta") { pattern = "[filename],[tag],fasta"; }
110 else if (type == "group") { pattern = "[filename],groups"; }
111 else if (type == "name") { pattern = "[filename],[tag],names"; }
112 else if (type == "count") { pattern = "[filename],[tag],count_table-[filename],count_table"; }
113 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
117 catch(exception& e) {
118 m->errorOut(e, "TrimSeqsCommand", "getOutputPattern");
122 //**********************************************************************************************************************
124 TrimSeqsCommand::TrimSeqsCommand(){
126 abort = true; calledHelp = true;
128 vector<string> tempOutNames;
129 outputTypes["fasta"] = tempOutNames;
130 outputTypes["qfile"] = tempOutNames;
131 outputTypes["group"] = tempOutNames;
132 outputTypes["name"] = tempOutNames;
133 outputTypes["count"] = tempOutNames;
135 catch(exception& e) {
136 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
140 //***************************************************************************************************************
142 TrimSeqsCommand::TrimSeqsCommand(string option) {
145 abort = false; calledHelp = false;
148 //allow user to run help
149 if(option == "help") { help(); abort = true; calledHelp = true; }
150 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
153 vector<string> myArray = setParameters();
155 OptionParser parser(option);
156 map<string,string> parameters = parser.getParameters();
158 ValidParameters validParameter;
159 map<string,string>::iterator it;
161 //check to make sure all parameters are valid for command
162 for (it = parameters.begin(); it != parameters.end(); it++) {
163 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
166 //initialize outputTypes
167 vector<string> tempOutNames;
168 outputTypes["fasta"] = tempOutNames;
169 outputTypes["qfile"] = tempOutNames;
170 outputTypes["group"] = tempOutNames;
171 outputTypes["name"] = tempOutNames;
172 outputTypes["count"] = tempOutNames;
174 //if the user changes the input directory command factory will send this info to us in the output parameter
175 string inputDir = validParameter.validFile(parameters, "inputdir", false);
176 if (inputDir == "not found"){ inputDir = ""; }
179 it = parameters.find("fasta");
180 //user has given a template file
181 if(it != parameters.end()){
182 path = m->hasPath(it->second);
183 //if the user has not given a path then, add inputdir. else leave path alone.
184 if (path == "") { parameters["fasta"] = inputDir + it->second; }
187 it = parameters.find("oligos");
188 //user has given a template file
189 if(it != parameters.end()){
190 path = m->hasPath(it->second);
191 //if the user has not given a path then, add inputdir. else leave path alone.
192 if (path == "") { parameters["oligos"] = inputDir + it->second; }
195 it = parameters.find("qfile");
196 //user has given a template file
197 if(it != parameters.end()){
198 path = m->hasPath(it->second);
199 //if the user has not given a path then, add inputdir. else leave path alone.
200 if (path == "") { parameters["qfile"] = inputDir + it->second; }
203 it = parameters.find("name");
204 //user has given a template file
205 if(it != parameters.end()){
206 path = m->hasPath(it->second);
207 //if the user has not given a path then, add inputdir. else leave path alone.
208 if (path == "") { parameters["name"] = inputDir + it->second; }
211 it = parameters.find("count");
212 //user has given a template file
213 if(it != parameters.end()){
214 path = m->hasPath(it->second);
215 //if the user has not given a path then, add inputdir. else leave path alone.
216 if (path == "") { parameters["count"] = inputDir + it->second; }
222 //check for required parameters
223 fastaFile = validParameter.validFile(parameters, "fasta", true);
224 if (fastaFile == "not found") {
225 fastaFile = m->getFastaFile();
226 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
227 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
228 }else if (fastaFile == "not open") { abort = true; }
229 else { m->setFastaFile(fastaFile); }
231 //if the user changes the output directory command factory will send this info to us in the output parameter
232 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
234 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
238 //check for optional parameter and set defaults
239 // ...at some point should added some additional type checking...
241 temp = validParameter.validFile(parameters, "flip", false);
242 if (temp == "not found") { flip = 0; }
243 else { flip = m->isTrue(temp); }
245 temp = validParameter.validFile(parameters, "oligos", true);
246 if (temp == "not found"){ oligoFile = ""; }
247 else if(temp == "not open"){ abort = true; }
248 else { oligoFile = temp; m->setOligosFile(oligoFile); }
251 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
252 m->mothurConvert(temp, maxAmbig);
254 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
255 m->mothurConvert(temp, maxHomoP);
257 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
258 m->mothurConvert(temp, minLength);
260 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
261 m->mothurConvert(temp, maxLength);
263 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
264 m->mothurConvert(temp, bdiffs);
266 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
267 m->mothurConvert(temp, pdiffs);
269 temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
270 m->mothurConvert(temp, ldiffs);
272 temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
273 m->mothurConvert(temp, sdiffs);
275 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
276 m->mothurConvert(temp, tdiffs);
278 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
280 temp = validParameter.validFile(parameters, "qfile", true);
281 if (temp == "not found") { qFileName = ""; }
282 else if(temp == "not open") { abort = true; }
283 else { qFileName = temp; m->setQualFile(qFileName); }
285 temp = validParameter.validFile(parameters, "name", true);
286 if (temp == "not found") { nameFile = ""; }
287 else if(temp == "not open") { nameFile = ""; abort = true; }
288 else { nameFile = temp; m->setNameFile(nameFile); }
290 countfile = validParameter.validFile(parameters, "count", true);
291 if (countfile == "not open") { abort = true; countfile = ""; }
292 else if (countfile == "not found") { countfile = ""; }
293 else { m->setCountTableFile(countfile); }
295 if ((countfile != "") && (nameFile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
297 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
298 m->mothurConvert(temp, qThreshold);
300 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
301 qtrim = m->isTrue(temp);
303 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
304 convert(temp, qRollAverage);
306 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
307 convert(temp, qWindowAverage);
309 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
310 convert(temp, qWindowSize);
312 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
313 convert(temp, qWindowStep);
315 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
316 convert(temp, qAverage);
318 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
319 convert(temp, keepFirst);
321 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
322 convert(temp, removeLast);
324 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
325 allFiles = m->isTrue(temp);
327 temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; }
328 keepforward = m->isTrue(temp);
330 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
331 m->setProcessors(temp);
332 m->mothurConvert(temp, processors);
335 if(allFiles && (oligoFile == "")){
336 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
338 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
339 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
343 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
344 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
348 if (countfile == "") {
349 if (nameFile == "") {
350 vector<string> files; files.push_back(fastaFile);
351 parser.getNameFile(files);
357 catch(exception& e) {
358 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
362 //***************************************************************************************************************
364 int TrimSeqsCommand::execute(){
367 if (abort == true) { if (calledHelp) { return 0; } return 2; }
369 numFPrimers = 0; //this needs to be initialized
374 vector<vector<string> > fastaFileNames;
375 vector<vector<string> > qualFileNames;
376 vector<vector<string> > nameFileNames;
378 map<string, string> variables;
379 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile));
380 variables["[tag]"] = "trim";
381 string trimSeqFile = getOutputFileName("fasta",variables);
382 string trimQualFile = getOutputFileName("qfile",variables);
383 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
385 variables["[tag]"] = "scrap";
386 string scrapSeqFile = getOutputFileName("fasta",variables);
387 string scrapQualFile = getOutputFileName("qfile",variables);
388 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
390 if (qFileName != "") {
391 outputNames.push_back(trimQualFile);
392 outputNames.push_back(scrapQualFile);
393 outputTypes["qfile"].push_back(trimQualFile);
394 outputTypes["qfile"].push_back(scrapQualFile);
397 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile));
398 variables["[tag]"] = "trim";
399 string trimNameFile = getOutputFileName("name",variables);
400 variables["[tag]"] = "scrap";
401 string scrapNameFile = getOutputFileName("name",variables);
403 if (nameFile != "") {
404 m->readNames(nameFile, nameMap);
405 outputNames.push_back(trimNameFile);
406 outputNames.push_back(scrapNameFile);
407 outputTypes["name"].push_back(trimNameFile);
408 outputTypes["name"].push_back(scrapNameFile);
411 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
412 variables["[tag]"] = "trim";
413 string trimCountFile = getOutputFileName("count",variables);
414 variables["[tag]"] = "scrap";
415 string scrapCountFile = getOutputFileName("count",variables);
417 if (countfile != "") {
419 ct.readTable(countfile);
420 nameCount = ct.getNameMap();
421 outputNames.push_back(trimCountFile);
422 outputNames.push_back(scrapCountFile);
423 outputTypes["count"].push_back(trimCountFile);
424 outputTypes["count"].push_back(scrapCountFile);
428 if (m->control_pressed) { return 0; }
430 string outputGroupFileName;
432 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
433 if ((createGroup) && (countfile == "")){
434 map<string, string> myvariables;
435 myvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile));
436 outputGroupFileName = getOutputFileName("group",myvariables);
437 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
441 //fills lines and qlines
442 setLines(fastaFile, qFileName);
445 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
447 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
451 if (m->control_pressed) { return 0; }
454 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
455 map<string, string>::iterator it;
456 set<string> namesToRemove;
457 for(int i=0;i<fastaFileNames.size();i++){
458 for(int j=0;j<fastaFileNames[0].size();j++){
459 if (fastaFileNames[i][j] != "") {
460 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
461 if(m->isBlank(fastaFileNames[i][j])){
462 m->mothurRemove(fastaFileNames[i][j]);
463 namesToRemove.insert(fastaFileNames[i][j]);
466 m->mothurRemove(qualFileNames[i][j]);
467 namesToRemove.insert(qualFileNames[i][j]);
471 m->mothurRemove(nameFileNames[i][j]);
472 namesToRemove.insert(nameFileNames[i][j]);
475 it = uniqueFastaNames.find(fastaFileNames[i][j]);
476 if (it == uniqueFastaNames.end()) {
477 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
485 //remove names for outputFileNames, just cleans up the output
486 vector<string> outputNames2;
487 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
488 outputNames = outputNames2;
490 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
492 m->openInputFile(it->first, in);
495 map<string, string> myvariables;
496 myvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(it->first));
497 string thisGroupName = "";
498 if (countfile == "") { thisGroupName = getOutputFileName("group",myvariables); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); }
499 else { thisGroupName = getOutputFileName("count",myvariables); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); }
500 m->openOutputFile(thisGroupName, out);
502 if (countfile != "") { out << "Representative_Sequence\ttotal\t" << it->second << endl; }
505 if (m->control_pressed) { break; }
507 Sequence currSeq(in); m->gobble(in);
508 if (countfile == "") {
509 out << currSeq.getName() << '\t' << it->second << endl;
511 if (nameFile != "") {
512 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
513 if (itName != nameMap.end()) {
514 vector<string> thisSeqsNames;
515 m->splitAtChar(itName->second, thisSeqsNames, ',');
516 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
517 out << thisSeqsNames[k] << '\t' << it->second << endl;
519 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
522 map<string, int>::iterator itTotalReps = nameCount.find(currSeq.getName());
523 if (itTotalReps != nameCount.end()) { out << currSeq.getName() << '\t' << itTotalReps->second << '\t' << itTotalReps->second << endl; }
524 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
531 if (countfile != "") { //create countfile with group info included
532 CountTable* ct = new CountTable();
533 ct->readTable(trimCountFile);
534 map<string, int> justTrimmedNames = ct->getNameMap();
538 for (map<string, int>::iterator itCount = groupCounts.begin(); itCount != groupCounts.end(); itCount++) { newCt.addGroup(itCount->first); }
539 vector<int> tempCounts; tempCounts.resize(groupCounts.size(), 0);
540 for (map<string, int>::iterator itNames = justTrimmedNames.begin(); itNames != justTrimmedNames.end(); itNames++) {
541 newCt.push_back(itNames->first, tempCounts); //add it to the table with no abundance so we can set the groups abundance
542 map<string, string>::iterator it2 = groupMap.find(itNames->first);
543 if (it2 != groupMap.end()) { newCt.setAbund(itNames->first, it2->second, itNames->second); }
544 else { m->mothurOut("[ERROR]: missing group info for " + itNames->first + "."); m->mothurOutEndLine(); m->control_pressed = true; }
546 newCt.printTable(trimCountFile);
550 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
552 //output group counts
553 m->mothurOutEndLine();
555 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
556 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
557 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
559 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
561 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
563 //set fasta file as new current fastafile
565 itTypes = outputTypes.find("fasta");
566 if (itTypes != outputTypes.end()) {
567 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
570 itTypes = outputTypes.find("name");
571 if (itTypes != outputTypes.end()) {
572 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
575 itTypes = outputTypes.find("qfile");
576 if (itTypes != outputTypes.end()) {
577 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
580 itTypes = outputTypes.find("group");
581 if (itTypes != outputTypes.end()) {
582 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
585 itTypes = outputTypes.find("count");
586 if (itTypes != outputTypes.end()) {
587 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
590 m->mothurOutEndLine();
591 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
592 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
593 m->mothurOutEndLine();
598 catch(exception& e) {
599 m->errorOut(e, "TrimSeqsCommand", "execute");
604 /**************************************************************************************/
605 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string trimCFileName, string scrapCFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {
609 ofstream trimFASTAFile;
610 m->openOutputFile(trimFileName, trimFASTAFile);
612 ofstream scrapFASTAFile;
613 m->openOutputFile(scrapFileName, scrapFASTAFile);
615 ofstream trimQualFile;
616 ofstream scrapQualFile;
618 m->openOutputFile(trimQFileName, trimQualFile);
619 m->openOutputFile(scrapQFileName, scrapQualFile);
622 ofstream trimNameFile;
623 ofstream scrapNameFile;
625 m->openOutputFile(trimNFileName, trimNameFile);
626 m->openOutputFile(scrapNFileName, scrapNameFile);
629 ofstream trimCountFile;
630 ofstream scrapCountFile;
632 m->openOutputFile(trimCFileName, trimCountFile);
633 m->openOutputFile(scrapCFileName, scrapCountFile);
634 if (line.start == 0) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
637 ofstream outGroupsFile;
638 if ((createGroup) && (countfile == "")){ m->openOutputFile(groupFileName, outGroupsFile); }
640 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
641 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
642 if (fastaFileNames[i][j] != "") {
644 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
646 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
650 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
658 m->openInputFile(filename, inFASTA);
659 inFASTA.seekg(line.start);
662 if(qFileName != "") {
663 m->openInputFile(qFileName, qFile);
664 qFile.seekg(qline.start);
669 TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
673 if (m->control_pressed) {
674 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
675 if ((createGroup) && (countfile == "")) { outGroupsFile.close(); }
676 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
677 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
678 if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
679 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0;
683 string trashCode = "";
684 int currentSeqsDiffs = 0;
686 Sequence currSeq(inFASTA); m->gobble(inFASTA);
687 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
689 QualityScores currQual;
691 currQual = QualityScores(qFile); m->gobble(qFile);
692 //cout << currQual.getName() << endl;
695 string origSeq = currSeq.getUnaligned();
698 int barcodeIndex = 0;
702 success = trimOligos.stripLinker(currSeq, currQual);
703 if(success > ldiffs) { trashCode += 'k'; }
704 else{ currentSeqsDiffs += success; }
708 if(barcodes.size() != 0){
709 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
710 if(success > bdiffs) { trashCode += 'b'; }
711 else{ currentSeqsDiffs += success; }
715 success = trimOligos.stripSpacer(currSeq, currQual);
716 if(success > sdiffs) { trashCode += 's'; }
717 else{ currentSeqsDiffs += success; }
721 if(numFPrimers != 0){
722 success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward);
723 if(success > pdiffs) { trashCode += 'f'; }
724 else{ currentSeqsDiffs += success; }
727 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
729 if(numRPrimers != 0){
730 success = trimOligos.stripReverse(currSeq, currQual);
731 if(!success) { trashCode += 'r'; }
735 success = keepFirstTrim(currSeq, currQual);
739 success = removeLastTrim(currSeq, currQual);
740 if(!success) { trashCode += 'l'; }
745 int origLength = currSeq.getNumBases();
747 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
748 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
749 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
750 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
751 else { success = 1; }
753 //you don't want to trim, if it fails above then scrap it
754 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
756 if(!success) { trashCode += 'q'; }
759 if(minLength > 0 || maxLength > 0){
760 success = cullLength(currSeq);
761 if(!success) { trashCode += 'l'; }
764 success = cullHomoP(currSeq);
765 if(!success) { trashCode += 'h'; }
768 success = cullAmbigs(currSeq);
769 if(!success) { trashCode += 'n'; }
772 if(flip){ // should go last
773 currSeq.reverseComplement();
775 currQual.flipQScores();
779 if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + ", trashcode= " + trashCode); if (trashCode.length() != 0) { m->mothurOutEndLine(); } }
781 if(trashCode.length() == 0){
782 string thisGroup = "";
784 if(barcodes.size() != 0){
785 thisGroup = barcodeNameVector[barcodeIndex];
786 if (primers.size() != 0) {
787 if (primerNameVector[primerIndex] != "") {
788 if(thisGroup != "") {
789 thisGroup += "." + primerNameVector[primerIndex];
791 thisGroup = primerNameVector[primerIndex];
798 int pos = thisGroup.find("ignore");
799 if (pos == string::npos) {
800 currSeq.setAligned(currSeq.getUnaligned());
801 currSeq.printSequence(trimFASTAFile);
804 currQual.printQScores(trimQualFile);
809 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
810 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
811 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
814 int numRedundants = 0;
815 if (countfile != "") {
816 map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
817 if (itCount != nameCount.end()) {
818 trimCountFile << itCount->first << '\t' << itCount->second << endl;
819 numRedundants = itCount->second-1;
820 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
824 if(barcodes.size() != 0){
826 if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); }
828 if (countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
829 else { groupMap[currSeq.getName()] = thisGroup; }
831 if (nameFile != "") {
832 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
833 if (itName != nameMap.end()) {
834 vector<string> thisSeqsNames;
835 m->splitAtChar(itName->second, thisSeqsNames, ',');
836 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
837 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
838 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
840 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
843 map<string, int>::iterator it = groupCounts.find(thisGroup);
844 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; }
845 else { groupCounts[it->first] += (1 + numRedundants); }
852 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
853 currSeq.printSequence(output);
857 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
858 currQual.printQScores(output);
863 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
864 if (itName != nameMap.end()) {
865 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
866 output << itName->first << '\t' << itName->second << endl;
868 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
874 if(nameFile != ""){ //needs to be before the currSeq name is changed
875 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
876 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
877 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
879 if (countfile != "") {
880 map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
881 if (itCount != nameCount.end()) {
882 trimCountFile << itCount->first << '\t' << itCount->second << endl;
883 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
886 currSeq.setName(currSeq.getName() + '|' + trashCode);
887 currSeq.setUnaligned(origSeq);
888 currSeq.setAligned(origSeq);
889 currSeq.printSequence(scrapFASTAFile);
891 currQual.printQScores(scrapQualFile);
897 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
898 unsigned long long pos = inFASTA.tellg();
899 if ((pos == -1) || (pos >= line.end)) { break; }
902 if (inFASTA.eof()) { break; }
906 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
910 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
914 trimFASTAFile.close();
915 scrapFASTAFile.close();
916 if (createGroup) { outGroupsFile.close(); }
917 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
918 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
919 if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
923 catch(exception& e) {
924 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
929 /**************************************************************************************************/
931 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string trimCountFileName, string scrapCountFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
938 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
939 //loop through and create all the processes you want
940 while (process != processors) {
944 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
948 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
949 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
950 vector<vector<string> > tempNameFileNames = nameFileNames;
955 for(int i=0;i<tempFASTAFileNames.size();i++){
956 for(int j=0;j<tempFASTAFileNames[i].size();j++){
957 if (tempFASTAFileNames[i][j] != "") {
958 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
959 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
962 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
963 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
966 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
967 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
974 driverCreateTrim(filename,
976 (trimFASTAFileName + toString(getpid()) + ".temp"),
977 (scrapFASTAFileName + toString(getpid()) + ".temp"),
978 (trimQualFileName + toString(getpid()) + ".temp"),
979 (scrapQualFileName + toString(getpid()) + ".temp"),
980 (trimNameFileName + toString(getpid()) + ".temp"),
981 (scrapNameFileName + toString(getpid()) + ".temp"),
982 (trimCountFileName + toString(getpid()) + ".temp"),
983 (scrapCountFileName + toString(getpid()) + ".temp"),
984 (groupFile + toString(getpid()) + ".temp"),
986 tempPrimerQualFileNames,
991 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(lines[process].start) + '\t' + toString(qLines[process].start) + '\t' + toString(getpid()) + '\n'); }
993 //pass groupCounts to parent
996 string tempFile = filename + toString(getpid()) + ".num.temp";
997 m->openOutputFile(tempFile, out);
999 out << groupCounts.size() << endl;
1001 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
1002 out << it->first << '\t' << it->second << endl;
1005 out << groupMap.size() << endl;
1006 for (map<string, string>::iterator it = groupMap.begin(); it != groupMap.end(); it++) {
1007 out << it->first << '\t' << it->second << endl;
1013 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1014 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1021 m->openOutputFile(trimFASTAFileName, temp); temp.close();
1022 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
1023 if(qFileName != ""){
1024 m->openOutputFile(trimQualFileName, temp); temp.close();
1025 m->openOutputFile(scrapQualFileName, temp); temp.close();
1027 if (nameFile != "") {
1028 m->openOutputFile(trimNameFileName, temp); temp.close();
1029 m->openOutputFile(scrapNameFileName, temp); temp.close();
1031 if (countfile != "") {
1032 m->openOutputFile(trimCountFileName, temp); temp.close();
1033 m->openOutputFile(scrapCountFileName, temp); temp.close();
1036 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, trimCountFileName, scrapCountFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
1038 //force parent to wait until all the processes are done
1039 for (int i=0;i<processIDS.size();i++) {
1040 int temp = processIDS[i];
1044 //////////////////////////////////////////////////////////////////////////////////////////////////////
1045 //Windows version shared memory, so be careful when passing variables through the trimData struct.
1046 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1047 //////////////////////////////////////////////////////////////////////////////////////////////////////
1049 vector<trimData*> pDataArray;
1050 DWORD dwThreadIdArray[processors-1];
1051 HANDLE hThreadArray[processors-1];
1053 //Create processor worker threads.
1054 for( int h=0; h<processors-1; h++){
1056 string extension = "";
1057 if (h != 0) { extension = toString(h) + ".temp"; processIDS.push_back(h); }
1058 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
1059 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
1060 vector<vector<string> > tempNameFileNames = nameFileNames;
1065 for(int i=0;i<tempFASTAFileNames.size();i++){
1066 for(int j=0;j<tempFASTAFileNames[i].size();j++){
1067 if (tempFASTAFileNames[i][j] != "") {
1068 tempFASTAFileNames[i][j] += extension;
1069 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
1071 if(qFileName != ""){
1072 tempPrimerQualFileNames[i][j] += extension;
1073 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
1076 tempNameFileNames[i][j] += extension;
1077 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
1085 trimData* tempTrim = new trimData(filename,
1086 qFileName, nameFile, countfile,
1087 (trimFASTAFileName+extension),
1088 (scrapFASTAFileName+extension),
1089 (trimQualFileName+extension),
1090 (scrapQualFileName+extension),
1091 (trimNameFileName+extension),
1092 (scrapNameFileName+extension),
1093 (trimCountFileName+extension),
1094 (scrapCountFileName+extension),
1095 (groupFile+extension),
1097 tempPrimerQualFileNames,
1099 lines[h].start, lines[h].end, qLines[h].start, qLines[h].end, m,
1100 pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer,
1101 primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
1102 qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
1103 minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap, nameCount);
1104 pDataArray.push_back(tempTrim);
1106 hThreadArray[h] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]);
1111 m->openOutputFile(trimFASTAFileName, temp); temp.close();
1112 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
1113 if(qFileName != ""){
1114 m->openOutputFile(trimQualFileName, temp); temp.close();
1115 m->openOutputFile(scrapQualFileName, temp); temp.close();
1117 if (nameFile != "") {
1118 m->openOutputFile(trimNameFileName, temp); temp.close();
1119 m->openOutputFile(scrapNameFileName, temp); temp.close();
1121 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
1122 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
1123 vector<vector<string> > tempNameFileNames = nameFileNames;
1126 string extension = toString(processors-1) + ".temp";
1127 for(int i=0;i<tempFASTAFileNames.size();i++){
1128 for(int j=0;j<tempFASTAFileNames[i].size();j++){
1129 if (tempFASTAFileNames[i][j] != "") {
1130 tempFASTAFileNames[i][j] += extension;
1131 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
1133 if(qFileName != ""){
1134 tempPrimerQualFileNames[i][j] += extension;
1135 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
1138 tempNameFileNames[i][j] += extension;
1139 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
1146 driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames, tempNameFileNames, lines[processors-1], qLines[processors-1]);
1147 processIDS.push_back(processors-1);
1150 //Wait until all threads have terminated.
1151 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1153 //Close all thread handles and free memory allocations.
1154 for(int i=0; i < pDataArray.size(); i++){
1155 if (pDataArray[i]->count != pDataArray[i]->lineEnd) {
1156 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->lineEnd) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
1158 for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
1159 map<string, int>::iterator it2 = groupCounts.find(it->first);
1160 if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
1161 else { groupCounts[it->first] += it->second; }
1163 for (map<string, string>::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) {
1164 map<string, string>::iterator it2 = groupMap.find(it->first);
1165 if (it2 == groupMap.end()) { groupMap[it->first] = it->second; }
1166 else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
1168 CloseHandle(hThreadArray[i]);
1169 delete pDataArray[i];
1176 for(int i=0;i<processIDS.size();i++){
1178 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1180 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
1181 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
1182 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
1183 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
1185 if(qFileName != ""){
1186 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
1187 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
1188 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
1189 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
1193 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
1194 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
1195 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
1196 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
1199 if(countfile != ""){
1200 m->appendFiles((trimCountFileName + toString(processIDS[i]) + ".temp"), trimCountFileName);
1201 m->mothurRemove((trimCountFileName + toString(processIDS[i]) + ".temp"));
1202 m->appendFiles((scrapCountFileName + toString(processIDS[i]) + ".temp"), scrapCountFileName);
1203 m->mothurRemove((scrapCountFileName + toString(processIDS[i]) + ".temp"));
1206 if((createGroup)&&(countfile == "")){
1207 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
1208 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
1213 for(int j=0;j<fastaFileNames.size();j++){
1214 for(int k=0;k<fastaFileNames[j].size();k++){
1215 if (fastaFileNames[j][k] != "") {
1216 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
1217 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1219 if(qFileName != ""){
1220 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
1221 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1225 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
1226 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1233 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1236 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1237 m->openInputFile(tempFile, in);
1241 in >> tempNum; m->gobble(in);
1244 for (int i = 0; i < tempNum; i++) {
1246 in >> group >> groupNum; m->gobble(in);
1248 map<string, int>::iterator it = groupCounts.find(group);
1249 if (it == groupCounts.end()) { groupCounts[group] = groupNum; }
1250 else { groupCounts[it->first] += groupNum; }
1253 in >> tempNum; m->gobble(in);
1255 for (int i = 0; i < tempNum; i++) {
1256 string group, seqName;
1257 in >> seqName >> group; m->gobble(in);
1259 map<string, string>::iterator it = groupMap.find(seqName);
1260 if (it == groupMap.end()) { groupMap[seqName] = group; }
1261 else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
1265 in.close(); m->mothurRemove(tempFile);
1272 catch(exception& e) {
1273 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
1278 /**************************************************************************************************/
1280 int TrimSeqsCommand::setLines(string filename, string qfilename) {
1283 vector<unsigned long long> fastaFilePos;
1284 vector<unsigned long long> qfileFilePos;
1286 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1287 //set file positions for fasta file
1288 fastaFilePos = m->divideFile(filename, processors);
1290 //get name of first sequence in each chunk
1291 map<string, int> firstSeqNames;
1292 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1294 m->openInputFile(filename, in);
1295 in.seekg(fastaFilePos[i]);
1298 firstSeqNames[temp.getName()] = i;
1303 if(qfilename != "") {
1304 //seach for filePos of each first name in the qfile and save in qfileFilePos
1306 m->openInputFile(qfilename, inQual);
1309 while(!inQual.eof()){
1310 input = m->getline(inQual);
1312 if (input.length() != 0) {
1313 if(input[0] == '>'){ //this is a sequence name line
1314 istringstream nameStream(input);
1316 string sname = ""; nameStream >> sname;
1317 sname = sname.substr(1);
1319 map<string, int>::iterator it = firstSeqNames.find(sname);
1321 if(it != firstSeqNames.end()) { //this is the start of a new chunk
1322 unsigned long long pos = inQual.tellg();
1323 qfileFilePos.push_back(pos - input.length() - 1);
1324 firstSeqNames.erase(it);
1329 if (firstSeqNames.size() == 0) { break; }
1334 if (firstSeqNames.size() != 0) {
1335 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
1336 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1342 //get last file position of qfile
1344 unsigned long long size;
1346 //get num bytes in file
1347 pFile = fopen (qfilename.c_str(),"rb");
1348 if (pFile==NULL) perror ("Error opening file");
1350 fseek (pFile, 0, SEEK_END);
1355 qfileFilePos.push_back(size);
1358 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1359 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) +'\t' + toString(fastaFilePos[i]) + '\t' + toString(fastaFilePos[i+1]) + '\n'); }
1360 lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
1361 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); }
1363 if(qfilename == "") { qLines = lines; } //files with duds
1369 if (processors == 1) { //save time
1370 //fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1371 //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
1372 lines.push_back(linePair(0, 1000));
1373 if (qfilename != "") { qLines.push_back(linePair(0, 1000)); }
1375 int numFastaSeqs = 0;
1376 fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs);
1377 if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); }
1379 if (qfilename != "") {
1380 int numQualSeqs = 0;
1381 qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs);
1383 if (numFastaSeqs != numQualSeqs) {
1384 m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true;
1388 //figure out how many sequences you have to process
1389 int numSeqsPerProcessor = numFastaSeqs / processors;
1390 for (int i = 0; i < processors; i++) {
1391 int startIndex = i * numSeqsPerProcessor;
1392 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
1393 lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
1394 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
1397 if(qfilename == "") { qLines = lines; } //files with duds
1402 catch(exception& e) {
1403 m->errorOut(e, "TrimSeqsCommand", "setLines");
1408 //***************************************************************************************************************
1410 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1413 m->openInputFile(oligoFile, inOligos);
1417 string type, oligo, group;
1419 int indexPrimer = 0;
1420 int indexBarcode = 0;
1422 while(!inOligos.eof()){
1426 if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }
1429 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1430 m->gobble(inOligos);
1433 m->gobble(inOligos);
1434 //make type case insensitive
1435 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1439 if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); }
1441 for(int i=0;i<oligo.length();i++){
1442 oligo[i] = toupper(oligo[i]);
1443 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1446 if(type == "FORWARD"){
1449 // get rest of line in case there is a primer name
1450 while (!inOligos.eof()) {
1451 char c = inOligos.get();
1452 if (c == 10 || c == 13){ break; }
1453 else if (c == 32 || c == 9){;} //space or tab
1454 else { group += c; }
1457 //check for repeat barcodes
1458 map<string, int>::iterator itPrime = primers.find(oligo);
1459 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1461 if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); } }
1463 primers[oligo]=indexPrimer; indexPrimer++;
1464 primerNameVector.push_back(group);
1466 else if(type == "REVERSE"){
1467 //Sequence oligoRC("reverse", oligo);
1468 //oligoRC.reverseComplement();
1469 string oligoRC = reverseOligo(oligo);
1470 revPrimer.push_back(oligoRC);
1472 else if(type == "BARCODE"){
1475 //check for repeat barcodes
1476 map<string, int>::iterator itBar = barcodes.find(oligo);
1477 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1479 barcodes[oligo]=indexBarcode; indexBarcode++;
1480 barcodeNameVector.push_back(group);
1481 }else if(type == "LINKER"){
1482 linker.push_back(oligo);
1483 }else if(type == "SPACER"){
1484 spacer.push_back(oligo);
1486 else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1488 m->gobble(inOligos);
1492 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1494 //add in potential combos
1495 if(barcodeNameVector.size() == 0){
1497 barcodeNameVector.push_back("");
1500 if(primerNameVector.size() == 0){
1502 primerNameVector.push_back("");
1505 fastaFileNames.resize(barcodeNameVector.size());
1506 for(int i=0;i<fastaFileNames.size();i++){
1507 fastaFileNames[i].assign(primerNameVector.size(), "");
1509 if(qFileName != "") { qualFileNames = fastaFileNames; }
1510 if(nameFile != "") { nameFileNames = fastaFileNames; }
1513 set<string> uniqueNames; //used to cleanup outputFileNames
1514 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1515 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1517 string primerName = primerNameVector[itPrimer->second];
1518 string barcodeName = barcodeNameVector[itBar->second];
1520 if ((primerName == "ignore") || (barcodeName == "ignore")) { } //do nothing
1522 string comboGroupName = "";
1523 string fastaFileName = "";
1524 string qualFileName = "";
1525 string nameFileName = "";
1526 string countFileName = "";
1528 if(primerName == ""){
1529 comboGroupName = barcodeNameVector[itBar->second];
1532 if(barcodeName == ""){
1533 comboGroupName = primerNameVector[itPrimer->second];
1536 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1542 map<string, string> variables;
1543 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile));
1544 variables["[tag]"] = comboGroupName;
1545 fastaFileName = getOutputFileName("fasta", variables);
1546 if (uniqueNames.count(fastaFileName) == 0) {
1547 outputNames.push_back(fastaFileName);
1548 outputTypes["fasta"].push_back(fastaFileName);
1549 uniqueNames.insert(fastaFileName);
1552 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1553 m->openOutputFile(fastaFileName, temp); temp.close();
1555 if(qFileName != ""){
1556 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qFileName));
1557 qualFileName = getOutputFileName("qfile", variables);
1558 if (uniqueNames.count(qualFileName) == 0) {
1559 outputNames.push_back(qualFileName);
1560 outputTypes["qfile"].push_back(qualFileName);
1563 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1564 m->openOutputFile(qualFileName, temp); temp.close();
1568 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile));
1569 nameFileName = getOutputFileName("name", variables);
1570 if (uniqueNames.count(nameFileName) == 0) {
1571 outputNames.push_back(nameFileName);
1572 outputTypes["name"].push_back(nameFileName);
1575 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1576 m->openOutputFile(nameFileName, temp); temp.close();
1582 numFPrimers = primers.size();
1583 numRPrimers = revPrimer.size();
1584 numLinkers = linker.size();
1585 numSpacers = spacer.size();
1587 bool allBlank = true;
1588 for (int i = 0; i < barcodeNameVector.size(); i++) {
1589 if (barcodeNameVector[i] != "") {
1594 for (int i = 0; i < primerNameVector.size(); i++) {
1595 if (primerNameVector[i] != "") {
1602 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1610 catch(exception& e) {
1611 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1615 //***************************************************************************************************************
1617 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1620 if(qscores.getName() != ""){
1621 qscores.trimQScores(-1, keepFirst);
1623 sequence.trim(keepFirst);
1626 catch(exception& e) {
1627 m->errorOut(e, "keepFirstTrim", "countDiffs");
1633 //***************************************************************************************************************
1635 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1639 int length = sequence.getNumBases() - removeLast;
1642 if(qscores.getName() != ""){
1643 qscores.trimQScores(-1, length);
1645 sequence.trim(length);
1654 catch(exception& e) {
1655 m->errorOut(e, "removeLastTrim", "countDiffs");
1661 //***************************************************************************************************************
1663 bool TrimSeqsCommand::cullLength(Sequence& seq){
1666 int length = seq.getNumBases();
1667 bool success = 0; //guilty until proven innocent
1669 if(length >= minLength && maxLength == 0) { success = 1; }
1670 else if(length >= minLength && length <= maxLength) { success = 1; }
1671 else { success = 0; }
1676 catch(exception& e) {
1677 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1683 //***************************************************************************************************************
1685 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1687 int longHomoP = seq.getLongHomoPolymer();
1688 bool success = 0; //guilty until proven innocent
1690 if(longHomoP <= maxHomoP){ success = 1; }
1691 else { success = 0; }
1695 catch(exception& e) {
1696 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1701 //********************************************************************/
1702 string TrimSeqsCommand::reverseOligo(string oligo){
1704 string reverse = "";
1706 for(int i=oligo.length()-1;i>=0;i--){
1708 if(oligo[i] == 'A') { reverse += 'T'; }
1709 else if(oligo[i] == 'T'){ reverse += 'A'; }
1710 else if(oligo[i] == 'U'){ reverse += 'A'; }
1712 else if(oligo[i] == 'G'){ reverse += 'C'; }
1713 else if(oligo[i] == 'C'){ reverse += 'G'; }
1715 else if(oligo[i] == 'R'){ reverse += 'Y'; }
1716 else if(oligo[i] == 'Y'){ reverse += 'R'; }
1718 else if(oligo[i] == 'M'){ reverse += 'K'; }
1719 else if(oligo[i] == 'K'){ reverse += 'M'; }
1721 else if(oligo[i] == 'W'){ reverse += 'W'; }
1722 else if(oligo[i] == 'S'){ reverse += 'S'; }
1724 else if(oligo[i] == 'B'){ reverse += 'V'; }
1725 else if(oligo[i] == 'V'){ reverse += 'B'; }
1727 else if(oligo[i] == 'D'){ reverse += 'H'; }
1728 else if(oligo[i] == 'H'){ reverse += 'D'; }
1730 else { reverse += 'N'; }
1736 catch(exception& e) {
1737 m->errorOut(e, "TrimSeqsCommand", "reverseOligo");
1742 //***************************************************************************************************************
1744 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1746 int numNs = seq.getAmbigBases();
1747 bool success = 0; //guilty until proven innocent
1749 if(numNs <= maxAmbig) { success = 1; }
1750 else { success = 0; }
1754 catch(exception& e) {
1755 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1760 //***************************************************************************************************************