5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12 #include "trimoligos.h"
14 //**********************************************************************************************************************
15 vector<string> TrimSeqsCommand::setParameters(){
17 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
18 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
19 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
21 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
22 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
23 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
24 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
25 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
26 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
27 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
28 CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
29 CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
30 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
31 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
32 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
33 CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward);
34 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
35 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
36 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
37 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
38 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
39 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
40 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
41 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
42 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
43 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
44 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
46 vector<string> myArray;
47 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
51 m->errorOut(e, "TrimSeqsCommand", "setParameters");
55 //**********************************************************************************************************************
56 string TrimSeqsCommand::getHelpString(){
58 string helpString = "";
59 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
60 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
61 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
62 helpString += "The fasta parameter is required.\n";
63 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
64 helpString += "The oligos parameter allows you to provide an oligos file.\n";
65 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
66 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
67 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
68 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
69 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
70 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
71 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
72 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
73 helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
74 helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
75 helpString += "The qfile parameter allows you to provide a quality file.\n";
76 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
77 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
78 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
79 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
80 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
81 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
82 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
83 helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n";
84 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
85 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
86 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
87 helpString += "The trim.seqs command should be in the following format: \n";
88 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
89 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
90 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
91 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
92 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
96 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
100 //**********************************************************************************************************************
101 string TrimSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
103 string outputFileName = "";
104 map<string, vector<string> >::iterator it;
106 //is this a type this command creates
107 it = outputTypes.find(type);
108 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
110 if (type == "qfile") { outputFileName = "qual"; }
111 else if (type == "fasta") { outputFileName = "fasta"; }
112 else if (type == "group") { outputFileName = "groups"; }
113 else if (type == "name") { outputFileName = "names"; }
114 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
116 return outputFileName;
118 catch(exception& e) {
119 m->errorOut(e, "TrimSeqsCommand", "getOutputFileNameTag");
125 //**********************************************************************************************************************
127 TrimSeqsCommand::TrimSeqsCommand(){
129 abort = true; calledHelp = true;
131 vector<string> tempOutNames;
132 outputTypes["fasta"] = tempOutNames;
133 outputTypes["qfile"] = tempOutNames;
134 outputTypes["group"] = tempOutNames;
135 outputTypes["name"] = tempOutNames;
137 catch(exception& e) {
138 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
142 //***************************************************************************************************************
144 TrimSeqsCommand::TrimSeqsCommand(string option) {
147 abort = false; calledHelp = false;
150 //allow user to run help
151 if(option == "help") { help(); abort = true; calledHelp = true; }
152 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
155 vector<string> myArray = setParameters();
157 OptionParser parser(option);
158 map<string,string> parameters = parser.getParameters();
160 ValidParameters validParameter;
161 map<string,string>::iterator it;
163 //check to make sure all parameters are valid for command
164 for (it = parameters.begin(); it != parameters.end(); it++) {
165 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
168 //initialize outputTypes
169 vector<string> tempOutNames;
170 outputTypes["fasta"] = tempOutNames;
171 outputTypes["qfile"] = tempOutNames;
172 outputTypes["group"] = tempOutNames;
173 outputTypes["name"] = tempOutNames;
175 //if the user changes the input directory command factory will send this info to us in the output parameter
176 string inputDir = validParameter.validFile(parameters, "inputdir", false);
177 if (inputDir == "not found"){ inputDir = ""; }
180 it = parameters.find("fasta");
181 //user has given a template file
182 if(it != parameters.end()){
183 path = m->hasPath(it->second);
184 //if the user has not given a path then, add inputdir. else leave path alone.
185 if (path == "") { parameters["fasta"] = inputDir + it->second; }
188 it = parameters.find("oligos");
189 //user has given a template file
190 if(it != parameters.end()){
191 path = m->hasPath(it->second);
192 //if the user has not given a path then, add inputdir. else leave path alone.
193 if (path == "") { parameters["oligos"] = inputDir + it->second; }
196 it = parameters.find("qfile");
197 //user has given a template file
198 if(it != parameters.end()){
199 path = m->hasPath(it->second);
200 //if the user has not given a path then, add inputdir. else leave path alone.
201 if (path == "") { parameters["qfile"] = inputDir + it->second; }
204 it = parameters.find("name");
205 //user has given a template file
206 if(it != parameters.end()){
207 path = m->hasPath(it->second);
208 //if the user has not given a path then, add inputdir. else leave path alone.
209 if (path == "") { parameters["name"] = inputDir + it->second; }
215 //check for required parameters
216 fastaFile = validParameter.validFile(parameters, "fasta", true);
217 if (fastaFile == "not found") {
218 fastaFile = m->getFastaFile();
219 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
220 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
221 }else if (fastaFile == "not open") { abort = true; }
222 else { m->setFastaFile(fastaFile); }
224 //if the user changes the output directory command factory will send this info to us in the output parameter
225 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
227 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
231 //check for optional parameter and set defaults
232 // ...at some point should added some additional type checking...
234 temp = validParameter.validFile(parameters, "flip", false);
235 if (temp == "not found") { flip = 0; }
236 else { flip = m->isTrue(temp); }
238 temp = validParameter.validFile(parameters, "oligos", true);
239 if (temp == "not found"){ oligoFile = ""; }
240 else if(temp == "not open"){ abort = true; }
241 else { oligoFile = temp; m->setOligosFile(oligoFile); }
244 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
245 m->mothurConvert(temp, maxAmbig);
247 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
248 m->mothurConvert(temp, maxHomoP);
250 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
251 m->mothurConvert(temp, minLength);
253 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
254 m->mothurConvert(temp, maxLength);
256 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
257 m->mothurConvert(temp, bdiffs);
259 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
260 m->mothurConvert(temp, pdiffs);
262 temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
263 m->mothurConvert(temp, ldiffs);
265 temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
266 m->mothurConvert(temp, sdiffs);
268 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
269 m->mothurConvert(temp, tdiffs);
271 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
273 temp = validParameter.validFile(parameters, "qfile", true);
274 if (temp == "not found") { qFileName = ""; }
275 else if(temp == "not open") { abort = true; }
276 else { qFileName = temp; m->setQualFile(qFileName); }
278 temp = validParameter.validFile(parameters, "name", true);
279 if (temp == "not found") { nameFile = ""; }
280 else if(temp == "not open") { nameFile = ""; abort = true; }
281 else { nameFile = temp; m->setNameFile(nameFile); }
283 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
284 m->mothurConvert(temp, qThreshold);
286 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
287 qtrim = m->isTrue(temp);
289 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
290 convert(temp, qRollAverage);
292 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
293 convert(temp, qWindowAverage);
295 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
296 convert(temp, qWindowSize);
298 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
299 convert(temp, qWindowStep);
301 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
302 convert(temp, qAverage);
304 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
305 convert(temp, keepFirst);
307 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
308 convert(temp, removeLast);
310 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
311 allFiles = m->isTrue(temp);
313 temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; }
314 keepforward = m->isTrue(temp);
316 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
317 m->setProcessors(temp);
318 m->mothurConvert(temp, processors);
321 if(allFiles && (oligoFile == "")){
322 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
324 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
325 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
329 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
330 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
334 if (nameFile == "") {
335 vector<string> files; files.push_back(fastaFile);
336 parser.getNameFile(files);
341 catch(exception& e) {
342 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
346 //***************************************************************************************************************
348 int TrimSeqsCommand::execute(){
351 if (abort == true) { if (calledHelp) { return 0; } return 2; }
353 numFPrimers = 0; //this needs to be initialized
358 vector<vector<string> > fastaFileNames;
359 vector<vector<string> > qualFileNames;
360 vector<vector<string> > nameFileNames;
362 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("fasta");
363 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
365 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("fasta");
366 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
368 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("qfile");
369 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("qfile");
371 if (qFileName != "") {
372 outputNames.push_back(trimQualFile);
373 outputNames.push_back(scrapQualFile);
374 outputTypes["qfile"].push_back(trimQualFile);
375 outputTypes["qfile"].push_back(scrapQualFile);
378 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim." + getOutputFileNameTag("name");
379 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap." + getOutputFileNameTag("name");
381 if (nameFile != "") {
382 m->readNames(nameFile, nameMap);
383 outputNames.push_back(trimNameFile);
384 outputNames.push_back(scrapNameFile);
385 outputTypes["name"].push_back(trimNameFile);
386 outputTypes["name"].push_back(scrapNameFile);
389 if (m->control_pressed) { return 0; }
391 string outputGroupFileName;
393 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
395 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + getOutputFileNameTag("group");
396 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
400 //fills lines and qlines
401 setLines(fastaFile, qFileName);
404 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
406 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
410 if (m->control_pressed) { return 0; }
413 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
414 map<string, string>::iterator it;
415 set<string> namesToRemove;
416 for(int i=0;i<fastaFileNames.size();i++){
417 for(int j=0;j<fastaFileNames[0].size();j++){
418 if (fastaFileNames[i][j] != "") {
419 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
420 if(m->isBlank(fastaFileNames[i][j])){
421 m->mothurRemove(fastaFileNames[i][j]);
422 namesToRemove.insert(fastaFileNames[i][j]);
425 m->mothurRemove(qualFileNames[i][j]);
426 namesToRemove.insert(qualFileNames[i][j]);
430 m->mothurRemove(nameFileNames[i][j]);
431 namesToRemove.insert(nameFileNames[i][j]);
434 it = uniqueFastaNames.find(fastaFileNames[i][j]);
435 if (it == uniqueFastaNames.end()) {
436 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
444 //remove names for outputFileNames, just cleans up the output
445 vector<string> outputNames2;
446 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
447 outputNames = outputNames2;
449 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
451 m->openInputFile(it->first, in);
454 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + getOutputFileNameTag("group");
455 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
456 m->openOutputFile(thisGroupName, out);
459 if (m->control_pressed) { break; }
461 Sequence currSeq(in); m->gobble(in);
462 out << currSeq.getName() << '\t' << it->second << endl;
464 if (nameFile != "") {
465 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
466 if (itName != nameMap.end()) {
467 vector<string> thisSeqsNames;
468 m->splitAtChar(itName->second, thisSeqsNames, ',');
469 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
470 out << thisSeqsNames[k] << '\t' << it->second << endl;
472 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
480 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
482 //output group counts
483 m->mothurOutEndLine();
485 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
486 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
487 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
489 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
491 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
493 //set fasta file as new current fastafile
495 itTypes = outputTypes.find("fasta");
496 if (itTypes != outputTypes.end()) {
497 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
500 itTypes = outputTypes.find("name");
501 if (itTypes != outputTypes.end()) {
502 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
505 itTypes = outputTypes.find("qfile");
506 if (itTypes != outputTypes.end()) {
507 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
510 itTypes = outputTypes.find("group");
511 if (itTypes != outputTypes.end()) {
512 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
515 m->mothurOutEndLine();
516 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
517 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
518 m->mothurOutEndLine();
523 catch(exception& e) {
524 m->errorOut(e, "TrimSeqsCommand", "execute");
529 /**************************************************************************************/
531 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {
535 ofstream trimFASTAFile;
536 m->openOutputFile(trimFileName, trimFASTAFile);
538 ofstream scrapFASTAFile;
539 m->openOutputFile(scrapFileName, scrapFASTAFile);
541 ofstream trimQualFile;
542 ofstream scrapQualFile;
544 m->openOutputFile(trimQFileName, trimQualFile);
545 m->openOutputFile(scrapQFileName, scrapQualFile);
548 ofstream trimNameFile;
549 ofstream scrapNameFile;
551 m->openOutputFile(trimNFileName, trimNameFile);
552 m->openOutputFile(scrapNFileName, scrapNameFile);
556 ofstream outGroupsFile;
557 if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); }
559 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
560 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
561 if (fastaFileNames[i][j] != "") {
563 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
565 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
569 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
577 m->openInputFile(filename, inFASTA);
578 inFASTA.seekg(line.start);
581 if(qFileName != "") {
582 m->openInputFile(qFileName, qFile);
583 qFile.seekg(qline.start);
588 TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
592 if (m->control_pressed) {
593 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
594 if (createGroup) { outGroupsFile.close(); }
599 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
605 string trashCode = "";
606 int currentSeqsDiffs = 0;
608 Sequence currSeq(inFASTA); m->gobble(inFASTA);
609 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
611 QualityScores currQual;
613 currQual = QualityScores(qFile); m->gobble(qFile);
614 //cout << currQual.getName() << endl;
617 string origSeq = currSeq.getUnaligned();
620 int barcodeIndex = 0;
624 success = trimOligos.stripLinker(currSeq, currQual);
625 if(success > ldiffs) { trashCode += 'k'; }
626 else{ currentSeqsDiffs += success; }
630 if(barcodes.size() != 0){
631 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
632 if(success > bdiffs) { trashCode += 'b'; }
633 else{ currentSeqsDiffs += success; }
636 if(rbarcodes.size() != 0){
637 success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
638 if(success > bdiffs) { trashCode += 'b'; }
639 else{ currentSeqsDiffs += success; }
643 success = trimOligos.stripSpacer(currSeq, currQual);
644 if(success > sdiffs) { trashCode += 's'; }
645 else{ currentSeqsDiffs += success; }
649 if(numFPrimers != 0){
650 success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward);
651 if(success > pdiffs) { trashCode += 'f'; }
652 else{ currentSeqsDiffs += success; }
655 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
657 if(numRPrimers != 0){
658 success = trimOligos.stripReverse(currSeq, currQual);
659 if(!success) { trashCode += 'r'; }
663 success = keepFirstTrim(currSeq, currQual);
667 success = removeLastTrim(currSeq, currQual);
668 if(!success) { trashCode += 'l'; }
673 int origLength = currSeq.getNumBases();
675 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
676 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
677 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
678 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
679 else { success = 1; }
681 //you don't want to trim, if it fails above then scrap it
682 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
684 if(!success) { trashCode += 'q'; }
687 if(minLength > 0 || maxLength > 0){
688 success = cullLength(currSeq);
689 if(!success) { trashCode += 'l'; }
692 success = cullHomoP(currSeq);
693 if(!success) { trashCode += 'h'; }
696 success = cullAmbigs(currSeq);
697 if(!success) { trashCode += 'n'; }
700 if(flip){ // should go last
701 currSeq.reverseComplement();
703 currQual.flipQScores();
707 if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + ", trashcode= " + trashCode); if (trashCode.length() != 0) { m->mothurOutEndLine(); } }
709 if(trashCode.length() == 0){
710 currSeq.setAligned(currSeq.getUnaligned());
711 currSeq.printSequence(trimFASTAFile);
714 currQual.printQScores(trimQualFile);
719 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
720 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
721 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
725 if(barcodes.size() != 0){
726 string thisGroup = barcodeNameVector[barcodeIndex];
727 if (primers.size() != 0) {
728 if (primerNameVector[primerIndex] != "") {
729 if(thisGroup != "") {
730 thisGroup += "." + primerNameVector[primerIndex];
732 thisGroup = primerNameVector[primerIndex];
737 if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); }
739 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
741 int numRedundants = 0;
742 if (nameFile != "") {
743 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
744 if (itName != nameMap.end()) {
745 vector<string> thisSeqsNames;
746 m->splitAtChar(itName->second, thisSeqsNames, ',');
747 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
748 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
749 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
751 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
754 map<string, int>::iterator it = groupCounts.find(thisGroup);
755 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; }
756 else { groupCounts[it->first] += (1 + numRedundants); }
763 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
764 currSeq.printSequence(output);
768 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
769 currQual.printQScores(output);
774 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
775 if (itName != nameMap.end()) {
776 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
777 output << itName->first << '\t' << itName->second << endl;
779 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
784 if(nameFile != ""){ //needs to be before the currSeq name is changed
785 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
786 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
787 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
789 currSeq.setName(currSeq.getName() + '|' + trashCode);
790 currSeq.setUnaligned(origSeq);
791 currSeq.setAligned(origSeq);
792 currSeq.printSequence(scrapFASTAFile);
794 currQual.printQScores(scrapQualFile);
800 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
801 unsigned long long pos = inFASTA.tellg();
802 if ((pos == -1) || (pos >= line.end)) { break; }
805 if (inFASTA.eof()) { break; }
809 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
813 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
817 trimFASTAFile.close();
818 scrapFASTAFile.close();
819 if (createGroup) { outGroupsFile.close(); }
820 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
821 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
825 catch(exception& e) {
826 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
831 /**************************************************************************************************/
833 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
840 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
841 //loop through and create all the processes you want
842 while (process != processors) {
846 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
850 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
851 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
852 vector<vector<string> > tempNameFileNames = nameFileNames;
857 for(int i=0;i<tempFASTAFileNames.size();i++){
858 for(int j=0;j<tempFASTAFileNames[i].size();j++){
859 if (tempFASTAFileNames[i][j] != "") {
860 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
861 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
864 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
865 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
868 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
869 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
876 driverCreateTrim(filename,
878 (trimFASTAFileName + toString(getpid()) + ".temp"),
879 (scrapFASTAFileName + toString(getpid()) + ".temp"),
880 (trimQualFileName + toString(getpid()) + ".temp"),
881 (scrapQualFileName + toString(getpid()) + ".temp"),
882 (trimNameFileName + toString(getpid()) + ".temp"),
883 (scrapNameFileName + toString(getpid()) + ".temp"),
884 (groupFile + toString(getpid()) + ".temp"),
886 tempPrimerQualFileNames,
891 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(lines[process].start) + '\t' + toString(qLines[process].start) + '\t' + toString(getpid()) + '\n'); }
893 //pass groupCounts to parent
896 string tempFile = filename + toString(getpid()) + ".num.temp";
897 m->openOutputFile(tempFile, out);
899 out << groupCounts.size() << endl;
901 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
902 out << it->first << '\t' << it->second << endl;
908 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
909 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
916 m->openOutputFile(trimFASTAFileName, temp); temp.close();
917 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
919 m->openOutputFile(trimQualFileName, temp); temp.close();
920 m->openOutputFile(scrapQualFileName, temp); temp.close();
922 if (nameFile != "") {
923 m->openOutputFile(trimNameFileName, temp); temp.close();
924 m->openOutputFile(scrapNameFileName, temp); temp.close();
927 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
929 //force parent to wait until all the processes are done
930 for (int i=0;i<processIDS.size();i++) {
931 int temp = processIDS[i];
935 //////////////////////////////////////////////////////////////////////////////////////////////////////
936 //Windows version shared memory, so be careful when passing variables through the trimData struct.
937 //Above fork() will clone, so memory is separate, but that's not the case with windows,
938 //////////////////////////////////////////////////////////////////////////////////////////////////////
940 vector<trimData*> pDataArray;
941 DWORD dwThreadIdArray[processors-1];
942 HANDLE hThreadArray[processors-1];
944 //Create processor worker threads.
945 for( int i=0; i<processors-1; i++){
947 string extension = "";
948 if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
949 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
950 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
951 vector<vector<string> > tempNameFileNames = nameFileNames;
956 for(int i=0;i<tempFASTAFileNames.size();i++){
957 for(int j=0;j<tempFASTAFileNames[i].size();j++){
958 if (tempFASTAFileNames[i][j] != "") {
959 tempFASTAFileNames[i][j] += extension;
960 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
963 tempPrimerQualFileNames[i][j] += extension;
964 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
967 tempNameFileNames[i][j] += extension;
968 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
976 trimData* tempTrim = new trimData(filename,
978 (trimFASTAFileName+extension),
979 (scrapFASTAFileName+extension),
980 (trimQualFileName+extension),
981 (scrapQualFileName+extension),
982 (trimNameFileName+extension),
983 (scrapNameFileName+extension),
984 (groupFile+extension),
986 tempPrimerQualFileNames,
988 lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
989 pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer,
990 primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
991 qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
992 minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap);
993 pDataArray.push_back(tempTrim);
995 hThreadArray[i] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
1000 m->openOutputFile(trimFASTAFileName, temp); temp.close();
1001 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
1002 if(qFileName != ""){
1003 m->openOutputFile(trimQualFileName, temp); temp.close();
1004 m->openOutputFile(scrapQualFileName, temp); temp.close();
1006 if (nameFile != "") {
1007 m->openOutputFile(trimNameFileName, temp); temp.close();
1008 m->openOutputFile(scrapNameFileName, temp); temp.close();
1011 driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]);
1012 processIDS.push_back(processors-1);
1015 //Wait until all threads have terminated.
1016 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1018 //Close all thread handles and free memory allocations.
1019 for(int i=0; i < pDataArray.size(); i++){
1020 for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
1021 map<string, int>::iterator it2 = groupCounts.find(it->first);
1022 if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
1023 else { groupCounts[it->first] += it->second; }
1025 CloseHandle(hThreadArray[i]);
1026 delete pDataArray[i];
1033 for(int i=0;i<processIDS.size();i++){
1035 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1037 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
1038 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
1039 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
1040 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
1042 if(qFileName != ""){
1043 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
1044 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
1045 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
1046 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
1050 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
1051 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
1052 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
1053 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
1057 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
1058 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
1063 for(int j=0;j<fastaFileNames.size();j++){
1064 for(int k=0;k<fastaFileNames[j].size();k++){
1065 if (fastaFileNames[j][k] != "") {
1066 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
1067 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1069 if(qFileName != ""){
1070 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
1071 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1075 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
1076 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1083 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1086 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1087 m->openInputFile(tempFile, in);
1091 in >> tempNum; m->gobble(in);
1095 in >> group >> tempNum; m->gobble(in);
1097 map<string, int>::iterator it = groupCounts.find(group);
1098 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
1099 else { groupCounts[it->first] += tempNum; }
1102 in.close(); m->mothurRemove(tempFile);
1109 catch(exception& e) {
1110 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
1115 /**************************************************************************************************/
1117 int TrimSeqsCommand::setLines(string filename, string qfilename) {
1120 vector<unsigned long long> fastaFilePos;
1121 vector<unsigned long long> qfileFilePos;
1123 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1124 //set file positions for fasta file
1125 fastaFilePos = m->divideFile(filename, processors);
1127 //get name of first sequence in each chunk
1128 map<string, int> firstSeqNames;
1129 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1131 m->openInputFile(filename, in);
1132 in.seekg(fastaFilePos[i]);
1135 firstSeqNames[temp.getName()] = i;
1140 if(qfilename != "") {
1141 //seach for filePos of each first name in the qfile and save in qfileFilePos
1143 m->openInputFile(qfilename, inQual);
1146 while(!inQual.eof()){
1147 input = m->getline(inQual);
1149 if (input.length() != 0) {
1150 if(input[0] == '>'){ //this is a sequence name line
1151 istringstream nameStream(input);
1153 string sname = ""; nameStream >> sname;
1154 sname = sname.substr(1);
1156 map<string, int>::iterator it = firstSeqNames.find(sname);
1158 if(it != firstSeqNames.end()) { //this is the start of a new chunk
1159 unsigned long long pos = inQual.tellg();
1160 qfileFilePos.push_back(pos - input.length() - 1);
1161 firstSeqNames.erase(it);
1166 if (firstSeqNames.size() == 0) { break; }
1171 if (firstSeqNames.size() != 0) {
1172 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
1173 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1179 //get last file position of qfile
1181 unsigned long long size;
1183 //get num bytes in file
1184 pFile = fopen (qfilename.c_str(),"rb");
1185 if (pFile==NULL) perror ("Error opening file");
1187 fseek (pFile, 0, SEEK_END);
1192 qfileFilePos.push_back(size);
1195 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1196 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) +'\t' + toString(fastaFilePos[i]) + '\t' + toString(fastaFilePos[i+1]) + '\n'); }
1197 lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
1198 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); }
1200 if(qfilename == "") { qLines = lines; } //files with duds
1206 if (processors == 1) { //save time
1207 //fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1208 //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
1209 lines.push_back(linePair(0, 1000));
1210 if (qfilename != "") { qLines.push_back(linePair(0, 1000)); }
1212 int numFastaSeqs = 0;
1213 fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs);
1214 if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); }
1216 if (qfilename != "") {
1217 int numQualSeqs = 0;
1218 qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs);
1220 if (numFastaSeqs != numQualSeqs) {
1221 m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true;
1225 //figure out how many sequences you have to process
1226 int numSeqsPerProcessor = numFastaSeqs / processors;
1227 for (int i = 0; i < processors; i++) {
1228 int startIndex = i * numSeqsPerProcessor;
1229 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
1230 lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
1231 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
1234 if(qfilename == "") { qLines = lines; } //files with duds
1239 catch(exception& e) {
1240 m->errorOut(e, "TrimSeqsCommand", "setLines");
1245 //***************************************************************************************************************
1247 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1250 m->openInputFile(oligoFile, inOligos);
1254 string type, oligo, group;
1256 int indexPrimer = 0;
1257 int indexBarcode = 0;
1259 while(!inOligos.eof()){
1263 if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }
1266 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1267 m->gobble(inOligos);
1270 m->gobble(inOligos);
1271 //make type case insensitive
1272 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1276 if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); }
1278 for(int i=0;i<oligo.length();i++){
1279 oligo[i] = toupper(oligo[i]);
1280 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1283 if(type == "FORWARD"){
1286 // get rest of line in case there is a primer name
1287 while (!inOligos.eof()) {
1288 char c = inOligos.get();
1289 if (c == 10 || c == 13){ break; }
1290 else if (c == 32 || c == 9){;} //space or tab
1291 else { group += c; }
1294 //check for repeat barcodes
1295 map<string, int>::iterator itPrime = primers.find(oligo);
1296 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1298 if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); } }
1300 primers[oligo]=indexPrimer; indexPrimer++;
1301 primerNameVector.push_back(group);
1303 else if(type == "REVERSE"){
1304 //Sequence oligoRC("reverse", oligo);
1305 //oligoRC.reverseComplement();
1306 string oligoRC = reverseOligo(oligo);
1307 revPrimer.push_back(oligoRC);
1309 else if(type == "BARCODE"){
1312 //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs
1313 //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info
1315 while (!inOligos.eof()) {
1316 char c = inOligos.get();
1317 if (c == 10 || c == 13){ break; }
1318 else if (c == 32 || c == 9){;} //space or tab
1322 //then this is illumina data with 4 columns
1325 for(int i=0;i<group.length();i++){
1326 group[i] = toupper(group[i]);
1327 if(group[i] == 'U') { group[i] = 'T'; }
1330 if (m->debug) { m->mothurOut("[DEBUG]: reading reverse " + group + ", and group = " + temp + ".\n"); }
1332 string reverseBarcode = reverseOligo(group); //reverse barcode
1333 //check for repeat barcodes
1334 map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
1335 if (itBar != rbarcodes.end()) { m->mothurOut("reverse barcode " + group + " is in your oligos file already."); m->mothurOutEndLine(); }
1338 rbarcodes[reverseBarcode]=indexBarcode;
1339 }else { if (m->debug) { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); } }
1341 //check for repeat barcodes
1342 map<string, int>::iterator itBar = barcodes.find(oligo);
1343 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1345 barcodes[oligo]=indexBarcode; indexBarcode++;
1346 barcodeNameVector.push_back(group);
1347 }else if(type == "LINKER"){
1348 linker.push_back(oligo);
1349 }else if(type == "SPACER"){
1350 spacer.push_back(oligo);
1352 else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1354 m->gobble(inOligos);
1358 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1360 //add in potential combos
1361 if(barcodeNameVector.size() == 0){
1363 barcodeNameVector.push_back("");
1366 if(primerNameVector.size() == 0){
1368 primerNameVector.push_back("");
1371 fastaFileNames.resize(barcodeNameVector.size());
1372 for(int i=0;i<fastaFileNames.size();i++){
1373 fastaFileNames[i].assign(primerNameVector.size(), "");
1375 if(qFileName != "") { qualFileNames = fastaFileNames; }
1376 if(nameFile != "") { nameFileNames = fastaFileNames; }
1379 set<string> uniqueNames; //used to cleanup outputFileNames
1380 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1381 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1383 string primerName = primerNameVector[itPrimer->second];
1384 string barcodeName = barcodeNameVector[itBar->second];
1386 string comboGroupName = "";
1387 string fastaFileName = "";
1388 string qualFileName = "";
1389 string nameFileName = "";
1391 if(primerName == ""){
1392 comboGroupName = barcodeNameVector[itBar->second];
1395 if(barcodeName == ""){
1396 comboGroupName = primerNameVector[itPrimer->second];
1399 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1405 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1406 if (uniqueNames.count(fastaFileName) == 0) {
1407 outputNames.push_back(fastaFileName);
1408 outputTypes["fasta"].push_back(fastaFileName);
1409 uniqueNames.insert(fastaFileName);
1412 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1413 m->openOutputFile(fastaFileName, temp); temp.close();
1415 if(qFileName != ""){
1416 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1417 if (uniqueNames.count(qualFileName) == 0) {
1418 outputNames.push_back(qualFileName);
1419 outputTypes["qfile"].push_back(qualFileName);
1422 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1423 m->openOutputFile(qualFileName, temp); temp.close();
1427 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1428 if (uniqueNames.count(nameFileName) == 0) {
1429 outputNames.push_back(nameFileName);
1430 outputTypes["name"].push_back(nameFileName);
1433 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1434 m->openOutputFile(nameFileName, temp); temp.close();
1440 numFPrimers = primers.size();
1441 numRPrimers = revPrimer.size();
1442 numLinkers = linker.size();
1443 numSpacers = spacer.size();
1445 bool allBlank = true;
1446 for (int i = 0; i < barcodeNameVector.size(); i++) {
1447 if (barcodeNameVector[i] != "") {
1452 for (int i = 0; i < primerNameVector.size(); i++) {
1453 if (primerNameVector[i] != "") {
1460 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1468 catch(exception& e) {
1469 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1473 //***************************************************************************************************************
1475 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1478 if(qscores.getName() != ""){
1479 qscores.trimQScores(-1, keepFirst);
1481 sequence.trim(keepFirst);
1484 catch(exception& e) {
1485 m->errorOut(e, "keepFirstTrim", "countDiffs");
1491 //***************************************************************************************************************
1493 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1497 int length = sequence.getNumBases() - removeLast;
1500 if(qscores.getName() != ""){
1501 qscores.trimQScores(-1, length);
1503 sequence.trim(length);
1512 catch(exception& e) {
1513 m->errorOut(e, "removeLastTrim", "countDiffs");
1519 //***************************************************************************************************************
1521 bool TrimSeqsCommand::cullLength(Sequence& seq){
1524 int length = seq.getNumBases();
1525 bool success = 0; //guilty until proven innocent
1527 if(length >= minLength && maxLength == 0) { success = 1; }
1528 else if(length >= minLength && length <= maxLength) { success = 1; }
1529 else { success = 0; }
1534 catch(exception& e) {
1535 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1541 //***************************************************************************************************************
1543 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1545 int longHomoP = seq.getLongHomoPolymer();
1546 bool success = 0; //guilty until proven innocent
1548 if(longHomoP <= maxHomoP){ success = 1; }
1549 else { success = 0; }
1553 catch(exception& e) {
1554 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1559 //********************************************************************/
1560 string TrimSeqsCommand::reverseOligo(string oligo){
1562 string reverse = "";
1564 for(int i=oligo.length()-1;i>=0;i--){
1566 if(oligo[i] == 'A') { reverse += 'T'; }
1567 else if(oligo[i] == 'T'){ reverse += 'A'; }
1568 else if(oligo[i] == 'U'){ reverse += 'A'; }
1570 else if(oligo[i] == 'G'){ reverse += 'C'; }
1571 else if(oligo[i] == 'C'){ reverse += 'G'; }
1573 else if(oligo[i] == 'R'){ reverse += 'Y'; }
1574 else if(oligo[i] == 'Y'){ reverse += 'R'; }
1576 else if(oligo[i] == 'M'){ reverse += 'K'; }
1577 else if(oligo[i] == 'K'){ reverse += 'M'; }
1579 else if(oligo[i] == 'W'){ reverse += 'W'; }
1580 else if(oligo[i] == 'S'){ reverse += 'S'; }
1582 else if(oligo[i] == 'B'){ reverse += 'V'; }
1583 else if(oligo[i] == 'V'){ reverse += 'B'; }
1585 else if(oligo[i] == 'D'){ reverse += 'H'; }
1586 else if(oligo[i] == 'H'){ reverse += 'D'; }
1588 else { reverse += 'N'; }
1594 catch(exception& e) {
1595 m->errorOut(e, "TrimSeqsCommand", "reverseOligo");
1600 //***************************************************************************************************************
1602 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1604 int numNs = seq.getAmbigBases();
1605 bool success = 0; //guilty until proven innocent
1607 if(numNs <= maxAmbig) { success = 1; }
1608 else { success = 0; }
1612 catch(exception& e) {
1613 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1618 //***************************************************************************************************************