5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12 #include "trimoligos.h"
15 //**********************************************************************************************************************
16 vector<string> TrimSeqsCommand::setParameters(){
18 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
19 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos);
20 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none","qfile",false,false,true); parameters.push_back(pqfile);
21 CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","name",false,false,true); parameters.push_back(pname);
22 CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","count",false,false,true); parameters.push_back(pcount);
23 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(pflip);
24 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxambig);
25 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pmaxhomop);
26 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pminlength);
27 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pmaxlength);
28 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
29 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pbdiffs);
30 CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pldiffs);
31 CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(psdiffs);
32 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs);
33 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
34 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pallfiles);
35 CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepforward);
36 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pqtrim);
37 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqthreshold);
38 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqaverage);
39 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(prollaverage);
40 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqwindowaverage);
41 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "","",false,false); parameters.push_back(pqstepsize);
42 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "","",false,false); parameters.push_back(pqwindowsize);
43 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pkeepfirst);
44 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "","",false,false); parameters.push_back(premovelast);
45 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
46 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
48 vector<string> myArray;
49 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
53 m->errorOut(e, "TrimSeqsCommand", "setParameters");
57 //**********************************************************************************************************************
58 string TrimSeqsCommand::getHelpString(){
60 string helpString = "";
61 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
62 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
63 helpString += "The trim.seqs command parameters are fasta, name, count, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
64 helpString += "The fasta parameter is required.\n";
65 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
66 helpString += "The oligos parameter allows you to provide an oligos file.\n";
67 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
68 helpString += "The count parameter allows you to provide a count file with your fasta file.\n";
69 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
70 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
71 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
72 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
73 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
74 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
75 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
76 helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
77 helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
78 helpString += "The qfile parameter allows you to provide a quality file.\n";
79 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
80 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
81 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
82 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
83 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
84 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
85 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
86 helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n";
87 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
88 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
89 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
90 helpString += "The trim.seqs command should be in the following format: \n";
91 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
92 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
93 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
94 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
95 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
99 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
103 //**********************************************************************************************************************
104 string TrimSeqsCommand::getOutputPattern(string type) {
108 if (type == "qfile") { pattern = "[filename],[tag],qual"; }
109 else if (type == "fasta") { pattern = "[filename],[tag],fasta"; }
110 else if (type == "group") { pattern = "[filename],groups"; }
111 else if (type == "name") { pattern = "[filename],[tag],names"; }
112 else if (type == "count") { pattern = "[filename],[tag],count_table-[filename],count_table"; }
113 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
117 catch(exception& e) {
118 m->errorOut(e, "TrimSeqsCommand", "getOutputPattern");
122 //**********************************************************************************************************************
124 TrimSeqsCommand::TrimSeqsCommand(){
126 abort = true; calledHelp = true;
128 vector<string> tempOutNames;
129 outputTypes["fasta"] = tempOutNames;
130 outputTypes["qfile"] = tempOutNames;
131 outputTypes["group"] = tempOutNames;
132 outputTypes["name"] = tempOutNames;
133 outputTypes["count"] = tempOutNames;
135 catch(exception& e) {
136 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
140 //***************************************************************************************************************
142 TrimSeqsCommand::TrimSeqsCommand(string option) {
145 abort = false; calledHelp = false;
148 //allow user to run help
149 if(option == "help") { help(); abort = true; calledHelp = true; }
150 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
153 vector<string> myArray = setParameters();
155 OptionParser parser(option);
156 map<string,string> parameters = parser.getParameters();
158 ValidParameters validParameter;
159 map<string,string>::iterator it;
161 //check to make sure all parameters are valid for command
162 for (it = parameters.begin(); it != parameters.end(); it++) {
163 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
166 //initialize outputTypes
167 vector<string> tempOutNames;
168 outputTypes["fasta"] = tempOutNames;
169 outputTypes["qfile"] = tempOutNames;
170 outputTypes["group"] = tempOutNames;
171 outputTypes["name"] = tempOutNames;
172 outputTypes["count"] = tempOutNames;
174 //if the user changes the input directory command factory will send this info to us in the output parameter
175 string inputDir = validParameter.validFile(parameters, "inputdir", false);
176 if (inputDir == "not found"){ inputDir = ""; }
179 it = parameters.find("fasta");
180 //user has given a template file
181 if(it != parameters.end()){
182 path = m->hasPath(it->second);
183 //if the user has not given a path then, add inputdir. else leave path alone.
184 if (path == "") { parameters["fasta"] = inputDir + it->second; }
187 it = parameters.find("oligos");
188 //user has given a template file
189 if(it != parameters.end()){
190 path = m->hasPath(it->second);
191 //if the user has not given a path then, add inputdir. else leave path alone.
192 if (path == "") { parameters["oligos"] = inputDir + it->second; }
195 it = parameters.find("qfile");
196 //user has given a template file
197 if(it != parameters.end()){
198 path = m->hasPath(it->second);
199 //if the user has not given a path then, add inputdir. else leave path alone.
200 if (path == "") { parameters["qfile"] = inputDir + it->second; }
203 it = parameters.find("name");
204 //user has given a template file
205 if(it != parameters.end()){
206 path = m->hasPath(it->second);
207 //if the user has not given a path then, add inputdir. else leave path alone.
208 if (path == "") { parameters["name"] = inputDir + it->second; }
211 it = parameters.find("count");
212 //user has given a template file
213 if(it != parameters.end()){
214 path = m->hasPath(it->second);
215 //if the user has not given a path then, add inputdir. else leave path alone.
216 if (path == "") { parameters["count"] = inputDir + it->second; }
222 //check for required parameters
223 fastaFile = validParameter.validFile(parameters, "fasta", true);
224 if (fastaFile == "not found") {
225 fastaFile = m->getFastaFile();
226 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
227 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
228 }else if (fastaFile == "not open") { abort = true; }
229 else { m->setFastaFile(fastaFile); }
231 //if the user changes the output directory command factory will send this info to us in the output parameter
232 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
234 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
238 //check for optional parameter and set defaults
239 // ...at some point should added some additional type checking...
241 temp = validParameter.validFile(parameters, "flip", false);
242 if (temp == "not found") { flip = 0; }
243 else { flip = m->isTrue(temp); }
245 temp = validParameter.validFile(parameters, "oligos", true);
246 if (temp == "not found"){ oligoFile = ""; }
247 else if(temp == "not open"){ abort = true; }
248 else { oligoFile = temp; m->setOligosFile(oligoFile); }
251 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
252 m->mothurConvert(temp, maxAmbig);
254 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
255 m->mothurConvert(temp, maxHomoP);
257 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
258 m->mothurConvert(temp, minLength);
260 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
261 m->mothurConvert(temp, maxLength);
263 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
264 m->mothurConvert(temp, bdiffs);
266 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
267 m->mothurConvert(temp, pdiffs);
269 temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
270 m->mothurConvert(temp, ldiffs);
272 temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
273 m->mothurConvert(temp, sdiffs);
275 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
276 m->mothurConvert(temp, tdiffs);
278 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
280 temp = validParameter.validFile(parameters, "qfile", true);
281 if (temp == "not found") { qFileName = ""; }
282 else if(temp == "not open") { abort = true; }
283 else { qFileName = temp; m->setQualFile(qFileName); }
285 temp = validParameter.validFile(parameters, "name", true);
286 if (temp == "not found") { nameFile = ""; }
287 else if(temp == "not open") { nameFile = ""; abort = true; }
288 else { nameFile = temp; m->setNameFile(nameFile); }
290 countfile = validParameter.validFile(parameters, "count", true);
291 if (countfile == "not open") { abort = true; countfile = ""; }
292 else if (countfile == "not found") { countfile = ""; }
293 else { m->setCountTableFile(countfile); }
295 if ((countfile != "") && (nameFile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
297 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
298 m->mothurConvert(temp, qThreshold);
300 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
301 qtrim = m->isTrue(temp);
303 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
304 convert(temp, qRollAverage);
306 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
307 convert(temp, qWindowAverage);
309 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
310 convert(temp, qWindowSize);
312 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
313 convert(temp, qWindowStep);
315 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
316 convert(temp, qAverage);
318 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
319 convert(temp, keepFirst);
321 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
322 convert(temp, removeLast);
324 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
325 allFiles = m->isTrue(temp);
327 temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; }
328 keepforward = m->isTrue(temp);
330 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
331 m->setProcessors(temp);
332 m->mothurConvert(temp, processors);
335 if(allFiles && (oligoFile == "")){
336 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
338 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
339 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
343 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
344 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
348 if (countfile == "") {
349 if (nameFile == "") {
350 vector<string> files; files.push_back(fastaFile);
351 parser.getNameFile(files);
357 catch(exception& e) {
358 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
362 //***************************************************************************************************************
364 int TrimSeqsCommand::execute(){
367 if (abort == true) { if (calledHelp) { return 0; } return 2; }
369 numFPrimers = 0; //this needs to be initialized
374 vector<vector<string> > fastaFileNames;
375 vector<vector<string> > qualFileNames;
376 vector<vector<string> > nameFileNames;
378 map<string, string> variables;
379 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile));
380 variables["[tag]"] = "trim";
381 string trimSeqFile = getOutputFileName("fasta",variables);
382 string trimQualFile = getOutputFileName("qfile",variables);
383 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
385 variables["[tag]"] = "scrap";
386 string scrapSeqFile = getOutputFileName("fasta",variables);
387 string scrapQualFile = getOutputFileName("qfile",variables);
388 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
390 if (qFileName != "") {
391 outputNames.push_back(trimQualFile);
392 outputNames.push_back(scrapQualFile);
393 outputTypes["qfile"].push_back(trimQualFile);
394 outputTypes["qfile"].push_back(scrapQualFile);
397 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile));
398 variables["[tag]"] = "trim";
399 string trimNameFile = getOutputFileName("name",variables);
400 variables["[tag]"] = "scrap";
401 string scrapNameFile = getOutputFileName("name",variables);
403 if (nameFile != "") {
404 m->readNames(nameFile, nameMap);
405 outputNames.push_back(trimNameFile);
406 outputNames.push_back(scrapNameFile);
407 outputTypes["name"].push_back(trimNameFile);
408 outputTypes["name"].push_back(scrapNameFile);
411 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
412 variables["[tag]"] = "trim";
413 string trimCountFile = getOutputFileName("count",variables);
414 variables["[tag]"] = "scrap";
415 string scrapCountFile = getOutputFileName("count",variables);
417 if (countfile != "") {
419 ct.readTable(countfile);
420 nameCount = ct.getNameMap();
421 outputNames.push_back(trimCountFile);
422 outputNames.push_back(scrapCountFile);
423 outputTypes["count"].push_back(trimCountFile);
424 outputTypes["count"].push_back(scrapCountFile);
428 if (m->control_pressed) { return 0; }
430 string outputGroupFileName;
432 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
433 if ((createGroup) && (countfile == "")){
434 map<string, string> myvariables;
435 myvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile));
436 outputGroupFileName = getOutputFileName("group",myvariables);
437 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
441 //fills lines and qlines
442 setLines(fastaFile, qFileName);
445 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
447 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
451 if (m->control_pressed) { return 0; }
454 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
455 map<string, string>::iterator it;
456 set<string> namesToRemove;
457 for(int i=0;i<fastaFileNames.size();i++){
458 for(int j=0;j<fastaFileNames[0].size();j++){
459 if (fastaFileNames[i][j] != "") {
460 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
461 if(m->isBlank(fastaFileNames[i][j])){
462 m->mothurRemove(fastaFileNames[i][j]);
463 namesToRemove.insert(fastaFileNames[i][j]);
466 m->mothurRemove(qualFileNames[i][j]);
467 namesToRemove.insert(qualFileNames[i][j]);
471 m->mothurRemove(nameFileNames[i][j]);
472 namesToRemove.insert(nameFileNames[i][j]);
475 it = uniqueFastaNames.find(fastaFileNames[i][j]);
476 if (it == uniqueFastaNames.end()) {
477 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
485 //remove names for outputFileNames, just cleans up the output
486 vector<string> outputNames2;
487 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
488 outputNames = outputNames2;
490 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
492 m->openInputFile(it->first, in);
495 map<string, string> myvariables;
496 myvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(it->first));
497 string thisGroupName = "";
498 if (countfile == "") { thisGroupName = getOutputFileName("group",myvariables); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); }
499 else { thisGroupName = getOutputFileName("count",myvariables); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); }
500 m->openOutputFile(thisGroupName, out);
502 if (countfile != "") { out << "Representative_Sequence\ttotal\t" << it->second << endl; }
505 if (m->control_pressed) { break; }
507 Sequence currSeq(in); m->gobble(in);
508 if (countfile == "") {
509 out << currSeq.getName() << '\t' << it->second << endl;
511 if (nameFile != "") {
512 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
513 if (itName != nameMap.end()) {
514 vector<string> thisSeqsNames;
515 m->splitAtChar(itName->second, thisSeqsNames, ',');
516 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
517 out << thisSeqsNames[k] << '\t' << it->second << endl;
519 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
522 map<string, int>::iterator itTotalReps = nameCount.find(currSeq.getName());
523 if (itTotalReps != nameCount.end()) { out << currSeq.getName() << '\t' << itTotalReps->second << '\t' << itTotalReps->second << endl; }
524 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
531 if (countfile != "") { //create countfile with group info included
532 CountTable* ct = new CountTable();
533 ct->readTable(trimCountFile);
534 map<string, int> justTrimmedNames = ct->getNameMap();
538 for (map<string, int>::iterator itCount = groupCounts.begin(); itCount != groupCounts.end(); itCount++) { newCt.addGroup(itCount->first); }
539 vector<int> tempCounts; tempCounts.resize(groupCounts.size(), 0);
540 for (map<string, int>::iterator itNames = justTrimmedNames.begin(); itNames != justTrimmedNames.end(); itNames++) {
541 newCt.push_back(itNames->first, tempCounts); //add it to the table with no abundance so we can set the groups abundance
542 map<string, string>::iterator it2 = groupMap.find(itNames->first);
543 if (it2 != groupMap.end()) { newCt.setAbund(itNames->first, it2->second, itNames->second); }
544 else { m->mothurOut("[ERROR]: missing group info for " + itNames->first + "."); m->mothurOutEndLine(); m->control_pressed = true; }
546 newCt.printTable(trimCountFile);
550 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
552 //output group counts
553 m->mothurOutEndLine();
555 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
556 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
557 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
559 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
561 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
563 //set fasta file as new current fastafile
565 itTypes = outputTypes.find("fasta");
566 if (itTypes != outputTypes.end()) {
567 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
570 itTypes = outputTypes.find("name");
571 if (itTypes != outputTypes.end()) {
572 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
575 itTypes = outputTypes.find("qfile");
576 if (itTypes != outputTypes.end()) {
577 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
580 itTypes = outputTypes.find("group");
581 if (itTypes != outputTypes.end()) {
582 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
585 itTypes = outputTypes.find("count");
586 if (itTypes != outputTypes.end()) {
587 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
590 m->mothurOutEndLine();
591 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
592 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
593 m->mothurOutEndLine();
598 catch(exception& e) {
599 m->errorOut(e, "TrimSeqsCommand", "execute");
604 /**************************************************************************************/
605 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string trimCFileName, string scrapCFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {
609 ofstream trimFASTAFile;
610 m->openOutputFile(trimFileName, trimFASTAFile);
612 ofstream scrapFASTAFile;
613 m->openOutputFile(scrapFileName, scrapFASTAFile);
615 ofstream trimQualFile;
616 ofstream scrapQualFile;
618 m->openOutputFile(trimQFileName, trimQualFile);
619 m->openOutputFile(scrapQFileName, scrapQualFile);
622 ofstream trimNameFile;
623 ofstream scrapNameFile;
625 m->openOutputFile(trimNFileName, trimNameFile);
626 m->openOutputFile(scrapNFileName, scrapNameFile);
629 ofstream trimCountFile;
630 ofstream scrapCountFile;
632 m->openOutputFile(trimCFileName, trimCountFile);
633 m->openOutputFile(scrapCFileName, scrapCountFile);
634 if (line.start == 0) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
637 ofstream outGroupsFile;
638 if ((createGroup) && (countfile == "")){ m->openOutputFile(groupFileName, outGroupsFile); }
640 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
641 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
642 if (fastaFileNames[i][j] != "") {
644 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
646 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
650 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
658 m->openInputFile(filename, inFASTA);
659 inFASTA.seekg(line.start);
662 if(qFileName != "") {
663 m->openInputFile(qFileName, qFile);
664 qFile.seekg(qline.start);
669 TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
673 if (m->control_pressed) {
674 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
675 if ((createGroup) && (countfile == "")) { outGroupsFile.close(); }
676 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
677 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
678 if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
679 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0;
683 string trashCode = "";
684 int currentSeqsDiffs = 0;
686 Sequence currSeq(inFASTA); m->gobble(inFASTA);
687 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
689 QualityScores currQual;
691 currQual = QualityScores(qFile); m->gobble(qFile);
692 //cout << currQual.getName() << endl;
695 string origSeq = currSeq.getUnaligned();
698 int barcodeIndex = 0;
702 success = trimOligos.stripLinker(currSeq, currQual);
703 if(success > ldiffs) { trashCode += 'k'; }
704 else{ currentSeqsDiffs += success; }
708 if(barcodes.size() != 0){
709 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
710 if(success > bdiffs) { trashCode += 'b'; }
711 else{ currentSeqsDiffs += success; }
715 success = trimOligos.stripSpacer(currSeq, currQual);
716 if(success > sdiffs) { trashCode += 's'; }
717 else{ currentSeqsDiffs += success; }
721 if(numFPrimers != 0){
722 success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward);
723 if(success > pdiffs) { trashCode += 'f'; }
724 else{ currentSeqsDiffs += success; }
727 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
729 if(numRPrimers != 0){
730 success = trimOligos.stripReverse(currSeq, currQual);
731 if(!success) { trashCode += 'r'; }
735 success = keepFirstTrim(currSeq, currQual);
739 success = removeLastTrim(currSeq, currQual);
740 if(!success) { trashCode += 'l'; }
745 int origLength = currSeq.getNumBases();
747 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
748 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
749 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
750 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
751 else { success = 1; }
753 //you don't want to trim, if it fails above then scrap it
754 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
756 if(!success) { trashCode += 'q'; }
759 if(minLength > 0 || maxLength > 0){
760 success = cullLength(currSeq);
761 if(!success) { trashCode += 'l'; }
764 success = cullHomoP(currSeq);
765 if(!success) { trashCode += 'h'; }
768 success = cullAmbigs(currSeq);
769 if(!success) { trashCode += 'n'; }
772 if(flip){ // should go last
773 currSeq.reverseComplement();
775 currQual.flipQScores();
779 if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + ", trashcode= " + trashCode); if (trashCode.length() != 0) { m->mothurOutEndLine(); } }
781 if(trashCode.length() == 0){
782 currSeq.setAligned(currSeq.getUnaligned());
783 currSeq.printSequence(trimFASTAFile);
786 currQual.printQScores(trimQualFile);
791 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
792 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
793 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
796 int numRedundants = 0;
797 if (countfile != "") {
798 map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
799 if (itCount != nameCount.end()) {
800 trimCountFile << itCount->first << '\t' << itCount->second << endl;
801 numRedundants = itCount->second-1;
802 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
806 if(barcodes.size() != 0){
807 string thisGroup = barcodeNameVector[barcodeIndex];
808 if (primers.size() != 0) {
809 if (primerNameVector[primerIndex] != "") {
810 if(thisGroup != "") {
811 thisGroup += "." + primerNameVector[primerIndex];
813 thisGroup = primerNameVector[primerIndex];
818 if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); }
820 if (countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
821 else { groupMap[currSeq.getName()] = thisGroup; }
823 if (nameFile != "") {
824 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
825 if (itName != nameMap.end()) {
826 vector<string> thisSeqsNames;
827 m->splitAtChar(itName->second, thisSeqsNames, ',');
828 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
829 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
830 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
832 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
835 map<string, int>::iterator it = groupCounts.find(thisGroup);
836 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; }
837 else { groupCounts[it->first] += (1 + numRedundants); }
844 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
845 currSeq.printSequence(output);
849 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
850 currQual.printQScores(output);
855 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
856 if (itName != nameMap.end()) {
857 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
858 output << itName->first << '\t' << itName->second << endl;
860 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
865 if(nameFile != ""){ //needs to be before the currSeq name is changed
866 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
867 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
868 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
870 if (countfile != "") {
871 map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
872 if (itCount != nameCount.end()) {
873 trimCountFile << itCount->first << '\t' << itCount->second << endl;
874 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
877 currSeq.setName(currSeq.getName() + '|' + trashCode);
878 currSeq.setUnaligned(origSeq);
879 currSeq.setAligned(origSeq);
880 currSeq.printSequence(scrapFASTAFile);
882 currQual.printQScores(scrapQualFile);
888 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
889 unsigned long long pos = inFASTA.tellg();
890 if ((pos == -1) || (pos >= line.end)) { break; }
893 if (inFASTA.eof()) { break; }
897 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
901 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
905 trimFASTAFile.close();
906 scrapFASTAFile.close();
907 if (createGroup) { outGroupsFile.close(); }
908 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
909 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
910 if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
914 catch(exception& e) {
915 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
920 /**************************************************************************************************/
922 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string trimCountFileName, string scrapCountFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
929 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
930 //loop through and create all the processes you want
931 while (process != processors) {
935 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
939 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
940 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
941 vector<vector<string> > tempNameFileNames = nameFileNames;
946 for(int i=0;i<tempFASTAFileNames.size();i++){
947 for(int j=0;j<tempFASTAFileNames[i].size();j++){
948 if (tempFASTAFileNames[i][j] != "") {
949 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
950 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
953 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
954 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
957 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
958 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
965 driverCreateTrim(filename,
967 (trimFASTAFileName + toString(getpid()) + ".temp"),
968 (scrapFASTAFileName + toString(getpid()) + ".temp"),
969 (trimQualFileName + toString(getpid()) + ".temp"),
970 (scrapQualFileName + toString(getpid()) + ".temp"),
971 (trimNameFileName + toString(getpid()) + ".temp"),
972 (scrapNameFileName + toString(getpid()) + ".temp"),
973 (trimCountFileName + toString(getpid()) + ".temp"),
974 (scrapCountFileName + toString(getpid()) + ".temp"),
975 (groupFile + toString(getpid()) + ".temp"),
977 tempPrimerQualFileNames,
982 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(lines[process].start) + '\t' + toString(qLines[process].start) + '\t' + toString(getpid()) + '\n'); }
984 //pass groupCounts to parent
987 string tempFile = filename + toString(getpid()) + ".num.temp";
988 m->openOutputFile(tempFile, out);
990 out << groupCounts.size() << endl;
992 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
993 out << it->first << '\t' << it->second << endl;
996 out << groupMap.size() << endl;
997 for (map<string, string>::iterator it = groupMap.begin(); it != groupMap.end(); it++) {
998 out << it->first << '\t' << it->second << endl;
1004 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1005 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1012 m->openOutputFile(trimFASTAFileName, temp); temp.close();
1013 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
1014 if(qFileName != ""){
1015 m->openOutputFile(trimQualFileName, temp); temp.close();
1016 m->openOutputFile(scrapQualFileName, temp); temp.close();
1018 if (nameFile != "") {
1019 m->openOutputFile(trimNameFileName, temp); temp.close();
1020 m->openOutputFile(scrapNameFileName, temp); temp.close();
1022 if (countfile != "") {
1023 m->openOutputFile(trimCountFileName, temp); temp.close();
1024 m->openOutputFile(scrapCountFileName, temp); temp.close();
1027 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, trimCountFileName, scrapCountFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
1029 //force parent to wait until all the processes are done
1030 for (int i=0;i<processIDS.size();i++) {
1031 int temp = processIDS[i];
1035 //////////////////////////////////////////////////////////////////////////////////////////////////////
1036 //Windows version shared memory, so be careful when passing variables through the trimData struct.
1037 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1038 //////////////////////////////////////////////////////////////////////////////////////////////////////
1040 vector<trimData*> pDataArray;
1041 DWORD dwThreadIdArray[processors-1];
1042 HANDLE hThreadArray[processors-1];
1044 //Create processor worker threads.
1045 for( int h=0; h<processors-1; h++){
1047 string extension = "";
1048 if (h != 0) { extension = toString(h) + ".temp"; processIDS.push_back(h); }
1049 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
1050 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
1051 vector<vector<string> > tempNameFileNames = nameFileNames;
1056 for(int i=0;i<tempFASTAFileNames.size();i++){
1057 for(int j=0;j<tempFASTAFileNames[i].size();j++){
1058 if (tempFASTAFileNames[i][j] != "") {
1059 tempFASTAFileNames[i][j] += extension;
1060 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
1062 if(qFileName != ""){
1063 tempPrimerQualFileNames[i][j] += extension;
1064 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
1067 tempNameFileNames[i][j] += extension;
1068 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
1076 trimData* tempTrim = new trimData(filename,
1077 qFileName, nameFile, countfile,
1078 (trimFASTAFileName+extension),
1079 (scrapFASTAFileName+extension),
1080 (trimQualFileName+extension),
1081 (scrapQualFileName+extension),
1082 (trimNameFileName+extension),
1083 (scrapNameFileName+extension),
1084 (trimCountFileName+extension),
1085 (scrapCountFileName+extension),
1086 (groupFile+extension),
1088 tempPrimerQualFileNames,
1090 lines[h].start, lines[h].end, qLines[h].start, qLines[h].end, m,
1091 pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer,
1092 primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
1093 qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
1094 minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap, nameCount);
1095 pDataArray.push_back(tempTrim);
1097 hThreadArray[h] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]);
1102 m->openOutputFile(trimFASTAFileName, temp); temp.close();
1103 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
1104 if(qFileName != ""){
1105 m->openOutputFile(trimQualFileName, temp); temp.close();
1106 m->openOutputFile(scrapQualFileName, temp); temp.close();
1108 if (nameFile != "") {
1109 m->openOutputFile(trimNameFileName, temp); temp.close();
1110 m->openOutputFile(scrapNameFileName, temp); temp.close();
1112 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
1113 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
1114 vector<vector<string> > tempNameFileNames = nameFileNames;
1117 string extension = toString(processors-1) + ".temp";
1118 for(int i=0;i<tempFASTAFileNames.size();i++){
1119 for(int j=0;j<tempFASTAFileNames[i].size();j++){
1120 if (tempFASTAFileNames[i][j] != "") {
1121 tempFASTAFileNames[i][j] += extension;
1122 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
1124 if(qFileName != ""){
1125 tempPrimerQualFileNames[i][j] += extension;
1126 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
1129 tempNameFileNames[i][j] += extension;
1130 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
1137 driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames, tempNameFileNames, lines[processors-1], qLines[processors-1]);
1138 processIDS.push_back(processors-1);
1141 //Wait until all threads have terminated.
1142 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1144 //Close all thread handles and free memory allocations.
1145 for(int i=0; i < pDataArray.size(); i++){
1146 for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
1147 map<string, int>::iterator it2 = groupCounts.find(it->first);
1148 if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
1149 else { groupCounts[it->first] += it->second; }
1151 for (map<string, string>::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) {
1152 map<string, string>::iterator it2 = groupMap.find(it->first);
1153 if (it2 == groupMap.end()) { groupMap[it->first] = it->second; }
1154 else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
1156 CloseHandle(hThreadArray[i]);
1157 delete pDataArray[i];
1164 for(int i=0;i<processIDS.size();i++){
1166 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1168 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
1169 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
1170 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
1171 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
1173 if(qFileName != ""){
1174 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
1175 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
1176 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
1177 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
1181 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
1182 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
1183 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
1184 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
1187 if(countfile != ""){
1188 m->appendFiles((trimCountFileName + toString(processIDS[i]) + ".temp"), trimCountFileName);
1189 m->mothurRemove((trimCountFileName + toString(processIDS[i]) + ".temp"));
1190 m->appendFiles((scrapCountFileName + toString(processIDS[i]) + ".temp"), scrapCountFileName);
1191 m->mothurRemove((scrapCountFileName + toString(processIDS[i]) + ".temp"));
1194 if((createGroup)&&(countfile == "")){
1195 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
1196 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
1201 for(int j=0;j<fastaFileNames.size();j++){
1202 for(int k=0;k<fastaFileNames[j].size();k++){
1203 if (fastaFileNames[j][k] != "") {
1204 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
1205 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1207 if(qFileName != ""){
1208 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
1209 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1213 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
1214 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1221 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1224 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1225 m->openInputFile(tempFile, in);
1229 in >> tempNum; m->gobble(in);
1232 for (int i = 0; i < tempNum; i++) {
1234 in >> group >> groupNum; m->gobble(in);
1236 map<string, int>::iterator it = groupCounts.find(group);
1237 if (it == groupCounts.end()) { groupCounts[group] = groupNum; }
1238 else { groupCounts[it->first] += groupNum; }
1241 in >> tempNum; m->gobble(in);
1243 for (int i = 0; i < tempNum; i++) {
1244 string group, seqName;
1245 in >> seqName >> group; m->gobble(in);
1247 map<string, string>::iterator it = groupMap.find(seqName);
1248 if (it == groupMap.end()) { groupMap[seqName] = group; }
1249 else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); }
1253 in.close(); m->mothurRemove(tempFile);
1260 catch(exception& e) {
1261 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
1266 /**************************************************************************************************/
1268 int TrimSeqsCommand::setLines(string filename, string qfilename) {
1271 vector<unsigned long long> fastaFilePos;
1272 vector<unsigned long long> qfileFilePos;
1274 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1275 //set file positions for fasta file
1276 fastaFilePos = m->divideFile(filename, processors);
1278 //get name of first sequence in each chunk
1279 map<string, int> firstSeqNames;
1280 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1282 m->openInputFile(filename, in);
1283 in.seekg(fastaFilePos[i]);
1286 firstSeqNames[temp.getName()] = i;
1291 if(qfilename != "") {
1292 //seach for filePos of each first name in the qfile and save in qfileFilePos
1294 m->openInputFile(qfilename, inQual);
1297 while(!inQual.eof()){
1298 input = m->getline(inQual);
1300 if (input.length() != 0) {
1301 if(input[0] == '>'){ //this is a sequence name line
1302 istringstream nameStream(input);
1304 string sname = ""; nameStream >> sname;
1305 sname = sname.substr(1);
1307 map<string, int>::iterator it = firstSeqNames.find(sname);
1309 if(it != firstSeqNames.end()) { //this is the start of a new chunk
1310 unsigned long long pos = inQual.tellg();
1311 qfileFilePos.push_back(pos - input.length() - 1);
1312 firstSeqNames.erase(it);
1317 if (firstSeqNames.size() == 0) { break; }
1322 if (firstSeqNames.size() != 0) {
1323 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
1324 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1330 //get last file position of qfile
1332 unsigned long long size;
1334 //get num bytes in file
1335 pFile = fopen (qfilename.c_str(),"rb");
1336 if (pFile==NULL) perror ("Error opening file");
1338 fseek (pFile, 0, SEEK_END);
1343 qfileFilePos.push_back(size);
1346 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1347 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) +'\t' + toString(fastaFilePos[i]) + '\t' + toString(fastaFilePos[i+1]) + '\n'); }
1348 lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
1349 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); }
1351 if(qfilename == "") { qLines = lines; } //files with duds
1357 if (processors == 1) { //save time
1358 //fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1359 //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
1360 lines.push_back(linePair(0, 1000));
1361 if (qfilename != "") { qLines.push_back(linePair(0, 1000)); }
1363 int numFastaSeqs = 0;
1364 fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs);
1365 if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); }
1367 if (qfilename != "") {
1368 int numQualSeqs = 0;
1369 qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs);
1371 if (numFastaSeqs != numQualSeqs) {
1372 m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true;
1376 //figure out how many sequences you have to process
1377 int numSeqsPerProcessor = numFastaSeqs / processors;
1378 for (int i = 0; i < processors; i++) {
1379 int startIndex = i * numSeqsPerProcessor;
1380 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
1381 lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
1382 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
1385 if(qfilename == "") { qLines = lines; } //files with duds
1390 catch(exception& e) {
1391 m->errorOut(e, "TrimSeqsCommand", "setLines");
1396 //***************************************************************************************************************
1398 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1401 m->openInputFile(oligoFile, inOligos);
1405 string type, oligo, group;
1407 int indexPrimer = 0;
1408 int indexBarcode = 0;
1410 while(!inOligos.eof()){
1414 if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }
1417 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1418 m->gobble(inOligos);
1421 m->gobble(inOligos);
1422 //make type case insensitive
1423 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1427 if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); }
1429 for(int i=0;i<oligo.length();i++){
1430 oligo[i] = toupper(oligo[i]);
1431 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1434 if(type == "FORWARD"){
1437 // get rest of line in case there is a primer name
1438 while (!inOligos.eof()) {
1439 char c = inOligos.get();
1440 if (c == 10 || c == 13){ break; }
1441 else if (c == 32 || c == 9){;} //space or tab
1442 else { group += c; }
1445 //check for repeat barcodes
1446 map<string, int>::iterator itPrime = primers.find(oligo);
1447 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1449 if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); } }
1451 primers[oligo]=indexPrimer; indexPrimer++;
1452 primerNameVector.push_back(group);
1454 else if(type == "REVERSE"){
1455 //Sequence oligoRC("reverse", oligo);
1456 //oligoRC.reverseComplement();
1457 string oligoRC = reverseOligo(oligo);
1458 revPrimer.push_back(oligoRC);
1460 else if(type == "BARCODE"){
1463 //check for repeat barcodes
1464 map<string, int>::iterator itBar = barcodes.find(oligo);
1465 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1467 barcodes[oligo]=indexBarcode; indexBarcode++;
1468 barcodeNameVector.push_back(group);
1469 }else if(type == "LINKER"){
1470 linker.push_back(oligo);
1471 }else if(type == "SPACER"){
1472 spacer.push_back(oligo);
1474 else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1476 m->gobble(inOligos);
1480 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1482 //add in potential combos
1483 if(barcodeNameVector.size() == 0){
1485 barcodeNameVector.push_back("");
1488 if(primerNameVector.size() == 0){
1490 primerNameVector.push_back("");
1493 fastaFileNames.resize(barcodeNameVector.size());
1494 for(int i=0;i<fastaFileNames.size();i++){
1495 fastaFileNames[i].assign(primerNameVector.size(), "");
1497 if(qFileName != "") { qualFileNames = fastaFileNames; }
1498 if(nameFile != "") { nameFileNames = fastaFileNames; }
1501 set<string> uniqueNames; //used to cleanup outputFileNames
1502 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1503 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1505 string primerName = primerNameVector[itPrimer->second];
1506 string barcodeName = barcodeNameVector[itBar->second];
1508 string comboGroupName = "";
1509 string fastaFileName = "";
1510 string qualFileName = "";
1511 string nameFileName = "";
1512 string countFileName = "";
1514 if(primerName == ""){
1515 comboGroupName = barcodeNameVector[itBar->second];
1518 if(barcodeName == ""){
1519 comboGroupName = primerNameVector[itPrimer->second];
1522 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1528 map<string, string> variables;
1529 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile));
1530 variables["[tag]"] = comboGroupName;
1531 fastaFileName = getOutputFileName("fasta", variables);
1532 if (uniqueNames.count(fastaFileName) == 0) {
1533 outputNames.push_back(fastaFileName);
1534 outputTypes["fasta"].push_back(fastaFileName);
1535 uniqueNames.insert(fastaFileName);
1538 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1539 m->openOutputFile(fastaFileName, temp); temp.close();
1541 if(qFileName != ""){
1542 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qFileName));
1543 qualFileName = getOutputFileName("qfile", variables);
1544 if (uniqueNames.count(qualFileName) == 0) {
1545 outputNames.push_back(qualFileName);
1546 outputTypes["qfile"].push_back(qualFileName);
1549 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1550 m->openOutputFile(qualFileName, temp); temp.close();
1554 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile));
1555 nameFileName = getOutputFileName("name", variables);
1556 if (uniqueNames.count(nameFileName) == 0) {
1557 outputNames.push_back(nameFileName);
1558 outputTypes["name"].push_back(nameFileName);
1561 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1562 m->openOutputFile(nameFileName, temp); temp.close();
1567 numFPrimers = primers.size();
1568 numRPrimers = revPrimer.size();
1569 numLinkers = linker.size();
1570 numSpacers = spacer.size();
1572 bool allBlank = true;
1573 for (int i = 0; i < barcodeNameVector.size(); i++) {
1574 if (barcodeNameVector[i] != "") {
1579 for (int i = 0; i < primerNameVector.size(); i++) {
1580 if (primerNameVector[i] != "") {
1587 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1595 catch(exception& e) {
1596 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1600 //***************************************************************************************************************
1602 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1605 if(qscores.getName() != ""){
1606 qscores.trimQScores(-1, keepFirst);
1608 sequence.trim(keepFirst);
1611 catch(exception& e) {
1612 m->errorOut(e, "keepFirstTrim", "countDiffs");
1618 //***************************************************************************************************************
1620 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1624 int length = sequence.getNumBases() - removeLast;
1627 if(qscores.getName() != ""){
1628 qscores.trimQScores(-1, length);
1630 sequence.trim(length);
1639 catch(exception& e) {
1640 m->errorOut(e, "removeLastTrim", "countDiffs");
1646 //***************************************************************************************************************
1648 bool TrimSeqsCommand::cullLength(Sequence& seq){
1651 int length = seq.getNumBases();
1652 bool success = 0; //guilty until proven innocent
1654 if(length >= minLength && maxLength == 0) { success = 1; }
1655 else if(length >= minLength && length <= maxLength) { success = 1; }
1656 else { success = 0; }
1661 catch(exception& e) {
1662 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1668 //***************************************************************************************************************
1670 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1672 int longHomoP = seq.getLongHomoPolymer();
1673 bool success = 0; //guilty until proven innocent
1675 if(longHomoP <= maxHomoP){ success = 1; }
1676 else { success = 0; }
1680 catch(exception& e) {
1681 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1686 //********************************************************************/
1687 string TrimSeqsCommand::reverseOligo(string oligo){
1689 string reverse = "";
1691 for(int i=oligo.length()-1;i>=0;i--){
1693 if(oligo[i] == 'A') { reverse += 'T'; }
1694 else if(oligo[i] == 'T'){ reverse += 'A'; }
1695 else if(oligo[i] == 'U'){ reverse += 'A'; }
1697 else if(oligo[i] == 'G'){ reverse += 'C'; }
1698 else if(oligo[i] == 'C'){ reverse += 'G'; }
1700 else if(oligo[i] == 'R'){ reverse += 'Y'; }
1701 else if(oligo[i] == 'Y'){ reverse += 'R'; }
1703 else if(oligo[i] == 'M'){ reverse += 'K'; }
1704 else if(oligo[i] == 'K'){ reverse += 'M'; }
1706 else if(oligo[i] == 'W'){ reverse += 'W'; }
1707 else if(oligo[i] == 'S'){ reverse += 'S'; }
1709 else if(oligo[i] == 'B'){ reverse += 'V'; }
1710 else if(oligo[i] == 'V'){ reverse += 'B'; }
1712 else if(oligo[i] == 'D'){ reverse += 'H'; }
1713 else if(oligo[i] == 'H'){ reverse += 'D'; }
1715 else { reverse += 'N'; }
1721 catch(exception& e) {
1722 m->errorOut(e, "TrimSeqsCommand", "reverseOligo");
1727 //***************************************************************************************************************
1729 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1731 int numNs = seq.getAmbigBases();
1732 bool success = 0; //guilty until proven innocent
1734 if(numNs <= maxAmbig) { success = 1; }
1735 else { success = 0; }
1739 catch(exception& e) {
1740 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1745 //***************************************************************************************************************