5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12 #include "trimoligos.h"
14 //**********************************************************************************************************************
15 vector<string> TrimSeqsCommand::setParameters(){
17 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
18 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
19 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
21 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
22 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
23 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
24 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
25 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
26 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
27 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
28 CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
29 CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
30 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
31 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
32 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
33 CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward);
34 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
35 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
36 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
37 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
38 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
39 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
40 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
41 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
42 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
43 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
44 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
46 vector<string> myArray;
47 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
51 m->errorOut(e, "TrimSeqsCommand", "setParameters");
55 //**********************************************************************************************************************
56 string TrimSeqsCommand::getHelpString(){
58 string helpString = "";
59 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
60 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
61 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
62 helpString += "The fasta parameter is required.\n";
63 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
64 helpString += "The oligos parameter allows you to provide an oligos file.\n";
65 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
66 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
67 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
68 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
69 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
70 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
71 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
72 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
73 helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
74 helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
75 helpString += "The qfile parameter allows you to provide a quality file.\n";
76 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
77 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
78 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
79 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
80 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
81 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
82 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
83 helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n";
84 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
85 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
86 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
87 helpString += "The trim.seqs command should be in the following format: \n";
88 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
89 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
90 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
91 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
92 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
96 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
100 //**********************************************************************************************************************
101 string TrimSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
103 string outputFileName = "";
104 map<string, vector<string> >::iterator it;
106 //is this a type this command creates
107 it = outputTypes.find(type);
108 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
110 if (type == "qfile") { outputFileName = "qual"; }
111 else if (type == "fasta") { outputFileName = "fasta"; }
112 else if (type == "group") { outputFileName = "groups"; }
113 else if (type == "name") { outputFileName = "names"; }
114 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
116 return outputFileName;
118 catch(exception& e) {
119 m->errorOut(e, "TrimSeqsCommand", "getOutputFileNameTag");
125 //**********************************************************************************************************************
127 TrimSeqsCommand::TrimSeqsCommand(){
129 abort = true; calledHelp = true;
131 vector<string> tempOutNames;
132 outputTypes["fasta"] = tempOutNames;
133 outputTypes["qfile"] = tempOutNames;
134 outputTypes["group"] = tempOutNames;
135 outputTypes["name"] = tempOutNames;
137 catch(exception& e) {
138 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
142 //***************************************************************************************************************
144 TrimSeqsCommand::TrimSeqsCommand(string option) {
147 abort = false; calledHelp = false;
150 //allow user to run help
151 if(option == "help") { help(); abort = true; calledHelp = true; }
152 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
155 vector<string> myArray = setParameters();
157 OptionParser parser(option);
158 map<string,string> parameters = parser.getParameters();
160 ValidParameters validParameter;
161 map<string,string>::iterator it;
163 //check to make sure all parameters are valid for command
164 for (it = parameters.begin(); it != parameters.end(); it++) {
165 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
168 //initialize outputTypes
169 vector<string> tempOutNames;
170 outputTypes["fasta"] = tempOutNames;
171 outputTypes["qfile"] = tempOutNames;
172 outputTypes["group"] = tempOutNames;
173 outputTypes["name"] = tempOutNames;
175 //if the user changes the input directory command factory will send this info to us in the output parameter
176 string inputDir = validParameter.validFile(parameters, "inputdir", false);
177 if (inputDir == "not found"){ inputDir = ""; }
180 it = parameters.find("fasta");
181 //user has given a template file
182 if(it != parameters.end()){
183 path = m->hasPath(it->second);
184 //if the user has not given a path then, add inputdir. else leave path alone.
185 if (path == "") { parameters["fasta"] = inputDir + it->second; }
188 it = parameters.find("oligos");
189 //user has given a template file
190 if(it != parameters.end()){
191 path = m->hasPath(it->second);
192 //if the user has not given a path then, add inputdir. else leave path alone.
193 if (path == "") { parameters["oligos"] = inputDir + it->second; }
196 it = parameters.find("qfile");
197 //user has given a template file
198 if(it != parameters.end()){
199 path = m->hasPath(it->second);
200 //if the user has not given a path then, add inputdir. else leave path alone.
201 if (path == "") { parameters["qfile"] = inputDir + it->second; }
204 it = parameters.find("name");
205 //user has given a template file
206 if(it != parameters.end()){
207 path = m->hasPath(it->second);
208 //if the user has not given a path then, add inputdir. else leave path alone.
209 if (path == "") { parameters["name"] = inputDir + it->second; }
215 //check for required parameters
216 fastaFile = validParameter.validFile(parameters, "fasta", true);
217 if (fastaFile == "not found") {
218 fastaFile = m->getFastaFile();
219 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
220 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
221 }else if (fastaFile == "not open") { abort = true; }
222 else { m->setFastaFile(fastaFile); }
224 //if the user changes the output directory command factory will send this info to us in the output parameter
225 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
227 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
231 //check for optional parameter and set defaults
232 // ...at some point should added some additional type checking...
234 temp = validParameter.validFile(parameters, "flip", false);
235 if (temp == "not found") { flip = 0; }
236 else { flip = m->isTrue(temp); }
238 temp = validParameter.validFile(parameters, "oligos", true);
239 if (temp == "not found"){ oligoFile = ""; }
240 else if(temp == "not open"){ abort = true; }
241 else { oligoFile = temp; m->setOligosFile(oligoFile); }
244 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
245 m->mothurConvert(temp, maxAmbig);
247 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
248 m->mothurConvert(temp, maxHomoP);
250 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
251 m->mothurConvert(temp, minLength);
253 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
254 m->mothurConvert(temp, maxLength);
256 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
257 m->mothurConvert(temp, bdiffs);
259 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
260 m->mothurConvert(temp, pdiffs);
262 temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
263 m->mothurConvert(temp, ldiffs);
265 temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
266 m->mothurConvert(temp, sdiffs);
268 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
269 m->mothurConvert(temp, tdiffs);
271 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
273 temp = validParameter.validFile(parameters, "qfile", true);
274 if (temp == "not found") { qFileName = ""; }
275 else if(temp == "not open") { abort = true; }
276 else { qFileName = temp; m->setQualFile(qFileName); }
278 temp = validParameter.validFile(parameters, "name", true);
279 if (temp == "not found") { nameFile = ""; }
280 else if(temp == "not open") { nameFile = ""; abort = true; }
281 else { nameFile = temp; m->setNameFile(nameFile); }
283 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
284 m->mothurConvert(temp, qThreshold);
286 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
287 qtrim = m->isTrue(temp);
289 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
290 convert(temp, qRollAverage);
292 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
293 convert(temp, qWindowAverage);
295 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
296 convert(temp, qWindowSize);
298 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
299 convert(temp, qWindowStep);
301 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
302 convert(temp, qAverage);
304 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
305 convert(temp, keepFirst);
307 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
308 convert(temp, removeLast);
310 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
311 allFiles = m->isTrue(temp);
313 temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; }
314 keepforward = m->isTrue(temp);
316 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
317 m->setProcessors(temp);
318 m->mothurConvert(temp, processors);
321 if(allFiles && (oligoFile == "")){
322 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
324 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
325 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
329 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
330 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
334 if (nameFile == "") {
335 vector<string> files; files.push_back(fastaFile);
336 parser.getNameFile(files);
341 catch(exception& e) {
342 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
346 //***************************************************************************************************************
348 int TrimSeqsCommand::execute(){
351 if (abort == true) { if (calledHelp) { return 0; } return 2; }
353 numFPrimers = 0; //this needs to be initialized
358 vector<vector<string> > fastaFileNames;
359 vector<vector<string> > qualFileNames;
360 vector<vector<string> > nameFileNames;
362 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("fasta");
363 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
365 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("fasta");
366 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
368 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim." + getOutputFileNameTag("qfile");
369 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap." + getOutputFileNameTag("qfile");
371 if (qFileName != "") {
372 outputNames.push_back(trimQualFile);
373 outputNames.push_back(scrapQualFile);
374 outputTypes["qfile"].push_back(trimQualFile);
375 outputTypes["qfile"].push_back(scrapQualFile);
378 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim." + getOutputFileNameTag("name");
379 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap." + getOutputFileNameTag("name");
381 if (nameFile != "") {
382 m->readNames(nameFile, nameMap);
383 outputNames.push_back(trimNameFile);
384 outputNames.push_back(scrapNameFile);
385 outputTypes["name"].push_back(trimNameFile);
386 outputTypes["name"].push_back(scrapNameFile);
389 if (m->control_pressed) { return 0; }
391 string outputGroupFileName;
393 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
395 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + getOutputFileNameTag("group");
396 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
400 //fills lines and qlines
401 setLines(fastaFile, qFileName);
404 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
406 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
410 if (m->control_pressed) { return 0; }
413 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
414 map<string, string>::iterator it;
415 set<string> namesToRemove;
416 for(int i=0;i<fastaFileNames.size();i++){
417 for(int j=0;j<fastaFileNames[0].size();j++){
418 if (fastaFileNames[i][j] != "") {
419 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
420 if(m->isBlank(fastaFileNames[i][j])){
421 m->mothurRemove(fastaFileNames[i][j]);
422 namesToRemove.insert(fastaFileNames[i][j]);
425 m->mothurRemove(qualFileNames[i][j]);
426 namesToRemove.insert(qualFileNames[i][j]);
430 m->mothurRemove(nameFileNames[i][j]);
431 namesToRemove.insert(nameFileNames[i][j]);
434 it = uniqueFastaNames.find(fastaFileNames[i][j]);
435 if (it == uniqueFastaNames.end()) {
436 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
444 //remove names for outputFileNames, just cleans up the output
445 vector<string> outputNames2;
446 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
447 outputNames = outputNames2;
449 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
451 m->openInputFile(it->first, in);
454 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + getOutputFileNameTag("group");
455 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
456 m->openOutputFile(thisGroupName, out);
459 if (m->control_pressed) { break; }
461 Sequence currSeq(in); m->gobble(in);
462 out << currSeq.getName() << '\t' << it->second << endl;
464 if (nameFile != "") {
465 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
466 if (itName != nameMap.end()) {
467 vector<string> thisSeqsNames;
468 m->splitAtChar(itName->second, thisSeqsNames, ',');
469 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
470 out << thisSeqsNames[k] << '\t' << it->second << endl;
472 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
480 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
482 //output group counts
483 m->mothurOutEndLine();
485 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
486 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
487 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
489 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
491 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
493 //set fasta file as new current fastafile
495 itTypes = outputTypes.find("fasta");
496 if (itTypes != outputTypes.end()) {
497 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
500 itTypes = outputTypes.find("name");
501 if (itTypes != outputTypes.end()) {
502 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
505 itTypes = outputTypes.find("qfile");
506 if (itTypes != outputTypes.end()) {
507 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
510 itTypes = outputTypes.find("group");
511 if (itTypes != outputTypes.end()) {
512 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
515 m->mothurOutEndLine();
516 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
517 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
518 m->mothurOutEndLine();
523 catch(exception& e) {
524 m->errorOut(e, "TrimSeqsCommand", "execute");
529 /**************************************************************************************/
531 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {
535 ofstream trimFASTAFile;
536 m->openOutputFile(trimFileName, trimFASTAFile);
538 ofstream scrapFASTAFile;
539 m->openOutputFile(scrapFileName, scrapFASTAFile);
541 ofstream trimQualFile;
542 ofstream scrapQualFile;
544 m->openOutputFile(trimQFileName, trimQualFile);
545 m->openOutputFile(scrapQFileName, scrapQualFile);
548 ofstream trimNameFile;
549 ofstream scrapNameFile;
551 m->openOutputFile(trimNFileName, trimNameFile);
552 m->openOutputFile(scrapNFileName, scrapNameFile);
556 ofstream outGroupsFile;
557 if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); }
559 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
560 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
561 if (fastaFileNames[i][j] != "") {
563 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
565 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
569 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
577 m->openInputFile(filename, inFASTA);
578 inFASTA.seekg(line.start);
581 if(qFileName != "") {
582 m->openInputFile(qFileName, qFile);
583 qFile.seekg(qline.start);
588 TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
592 if (m->control_pressed) {
593 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
594 if (createGroup) { outGroupsFile.close(); }
599 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
605 string trashCode = "";
606 int currentSeqsDiffs = 0;
608 Sequence currSeq(inFASTA); m->gobble(inFASTA);
609 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
611 QualityScores currQual;
613 currQual = QualityScores(qFile); m->gobble(qFile);
616 string origSeq = currSeq.getUnaligned();
619 int barcodeIndex = 0;
623 success = trimOligos.stripLinker(currSeq, currQual);
624 if(success > ldiffs) { trashCode += 'k'; }
625 else{ currentSeqsDiffs += success; }
629 if(barcodes.size() != 0){
630 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
631 if(success > bdiffs) { trashCode += 'b'; }
632 else{ currentSeqsDiffs += success; }
635 if(rbarcodes.size() != 0){
636 success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
637 if(success > bdiffs) { trashCode += 'b'; }
638 else{ currentSeqsDiffs += success; }
642 success = trimOligos.stripSpacer(currSeq, currQual);
643 if(success > sdiffs) { trashCode += 's'; }
644 else{ currentSeqsDiffs += success; }
648 if(numFPrimers != 0){
649 success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward);
650 if(success > pdiffs) { trashCode += 'f'; }
651 else{ currentSeqsDiffs += success; }
654 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
656 if(numRPrimers != 0){
657 success = trimOligos.stripReverse(currSeq, currQual);
658 if(!success) { trashCode += 'r'; }
662 success = keepFirstTrim(currSeq, currQual);
666 success = removeLastTrim(currSeq, currQual);
667 if(!success) { trashCode += 'l'; }
672 int origLength = currSeq.getNumBases();
674 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
675 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
676 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
677 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
678 else { success = 1; }
680 //you don't want to trim, if it fails above then scrap it
681 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
683 if(!success) { trashCode += 'q'; }
686 if(minLength > 0 || maxLength > 0){
687 success = cullLength(currSeq);
688 if(!success) { trashCode += 'l'; }
691 success = cullHomoP(currSeq);
692 if(!success) { trashCode += 'h'; }
695 success = cullAmbigs(currSeq);
696 if(!success) { trashCode += 'n'; }
699 if(flip){ // should go last
700 currSeq.reverseComplement();
702 currQual.flipQScores();
706 if(trashCode.length() == 0){
707 currSeq.setAligned(currSeq.getUnaligned());
708 currSeq.printSequence(trimFASTAFile);
711 currQual.printQScores(trimQualFile);
716 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
717 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
718 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
722 if(barcodes.size() != 0){
723 string thisGroup = barcodeNameVector[barcodeIndex];
724 if (primers.size() != 0) {
725 if (primerNameVector[primerIndex] != "") {
726 if(thisGroup != "") {
727 thisGroup += "." + primerNameVector[primerIndex];
729 thisGroup = primerNameVector[primerIndex];
734 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
736 int numRedundants = 0;
737 if (nameFile != "") {
738 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
739 if (itName != nameMap.end()) {
740 vector<string> thisSeqsNames;
741 m->splitAtChar(itName->second, thisSeqsNames, ',');
742 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
743 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
744 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
746 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
749 map<string, int>::iterator it = groupCounts.find(thisGroup);
750 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; }
751 else { groupCounts[it->first] += (1 + numRedundants); }
758 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
759 currSeq.printSequence(output);
763 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
764 currQual.printQScores(output);
769 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
770 if (itName != nameMap.end()) {
771 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
772 output << itName->first << '\t' << itName->second << endl;
774 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
779 if(nameFile != ""){ //needs to be before the currSeq name is changed
780 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
781 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
782 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
784 currSeq.setName(currSeq.getName() + '|' + trashCode);
785 currSeq.setUnaligned(origSeq);
786 currSeq.setAligned(origSeq);
787 currSeq.printSequence(scrapFASTAFile);
789 currQual.printQScores(scrapQualFile);
795 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
796 unsigned long long pos = inFASTA.tellg();
797 if ((pos == -1) || (pos >= line.end)) { break; }
800 if (inFASTA.eof()) { break; }
804 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
808 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
812 trimFASTAFile.close();
813 scrapFASTAFile.close();
814 if (createGroup) { outGroupsFile.close(); }
815 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
816 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
820 catch(exception& e) {
821 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
826 /**************************************************************************************************/
828 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
835 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
836 //loop through and create all the processes you want
837 while (process != processors) {
841 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
845 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
846 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
847 vector<vector<string> > tempNameFileNames = nameFileNames;
852 for(int i=0;i<tempFASTAFileNames.size();i++){
853 for(int j=0;j<tempFASTAFileNames[i].size();j++){
854 if (tempFASTAFileNames[i][j] != "") {
855 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
856 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
859 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
860 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
863 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
864 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
871 driverCreateTrim(filename,
873 (trimFASTAFileName + toString(getpid()) + ".temp"),
874 (scrapFASTAFileName + toString(getpid()) + ".temp"),
875 (trimQualFileName + toString(getpid()) + ".temp"),
876 (scrapQualFileName + toString(getpid()) + ".temp"),
877 (trimNameFileName + toString(getpid()) + ".temp"),
878 (scrapNameFileName + toString(getpid()) + ".temp"),
879 (groupFile + toString(getpid()) + ".temp"),
881 tempPrimerQualFileNames,
886 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(lines[process].start) + '\t' + toString(qLines[process].start) + '\t' + toString(getpid()) + '\n'); }
888 //pass groupCounts to parent
891 string tempFile = filename + toString(getpid()) + ".num.temp";
892 m->openOutputFile(tempFile, out);
894 out << groupCounts.size() << endl;
896 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
897 out << it->first << '\t' << it->second << endl;
903 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
904 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
911 m->openOutputFile(trimFASTAFileName, temp); temp.close();
912 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
914 m->openOutputFile(trimQualFileName, temp); temp.close();
915 m->openOutputFile(scrapQualFileName, temp); temp.close();
917 if (nameFile != "") {
918 m->openOutputFile(trimNameFileName, temp); temp.close();
919 m->openOutputFile(scrapNameFileName, temp); temp.close();
922 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
924 //force parent to wait until all the processes are done
925 for (int i=0;i<processIDS.size();i++) {
926 int temp = processIDS[i];
930 //////////////////////////////////////////////////////////////////////////////////////////////////////
931 //Windows version shared memory, so be careful when passing variables through the trimData struct.
932 //Above fork() will clone, so memory is separate, but that's not the case with windows,
933 //////////////////////////////////////////////////////////////////////////////////////////////////////
935 vector<trimData*> pDataArray;
936 DWORD dwThreadIdArray[processors-1];
937 HANDLE hThreadArray[processors-1];
939 //Create processor worker threads.
940 for( int i=0; i<processors-1; i++){
942 string extension = "";
943 if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
944 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
945 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
946 vector<vector<string> > tempNameFileNames = nameFileNames;
951 for(int i=0;i<tempFASTAFileNames.size();i++){
952 for(int j=0;j<tempFASTAFileNames[i].size();j++){
953 if (tempFASTAFileNames[i][j] != "") {
954 tempFASTAFileNames[i][j] += extension;
955 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
958 tempPrimerQualFileNames[i][j] += extension;
959 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
962 tempNameFileNames[i][j] += extension;
963 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
971 trimData* tempTrim = new trimData(filename,
973 (trimFASTAFileName+extension),
974 (scrapFASTAFileName+extension),
975 (trimQualFileName+extension),
976 (scrapQualFileName+extension),
977 (trimNameFileName+extension),
978 (scrapNameFileName+extension),
979 (groupFile+extension),
981 tempPrimerQualFileNames,
983 lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
984 pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer,
985 primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
986 qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
987 minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap);
988 pDataArray.push_back(tempTrim);
990 hThreadArray[i] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
995 m->openOutputFile(trimFASTAFileName, temp); temp.close();
996 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
998 m->openOutputFile(trimQualFileName, temp); temp.close();
999 m->openOutputFile(scrapQualFileName, temp); temp.close();
1001 if (nameFile != "") {
1002 m->openOutputFile(trimNameFileName, temp); temp.close();
1003 m->openOutputFile(scrapNameFileName, temp); temp.close();
1006 driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]);
1007 processIDS.push_back(processors-1);
1010 //Wait until all threads have terminated.
1011 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1013 //Close all thread handles and free memory allocations.
1014 for(int i=0; i < pDataArray.size(); i++){
1015 for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
1016 map<string, int>::iterator it2 = groupCounts.find(it->first);
1017 if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
1018 else { groupCounts[it->first] += it->second; }
1020 CloseHandle(hThreadArray[i]);
1021 delete pDataArray[i];
1028 for(int i=0;i<processIDS.size();i++){
1030 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1032 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
1033 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
1034 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
1035 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
1037 if(qFileName != ""){
1038 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
1039 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
1040 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
1041 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
1045 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
1046 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
1047 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
1048 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
1052 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
1053 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
1058 for(int j=0;j<fastaFileNames.size();j++){
1059 for(int k=0;k<fastaFileNames[j].size();k++){
1060 if (fastaFileNames[j][k] != "") {
1061 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
1062 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1064 if(qFileName != ""){
1065 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
1066 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1070 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
1071 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1078 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1081 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1082 m->openInputFile(tempFile, in);
1086 in >> tempNum; m->gobble(in);
1090 in >> group >> tempNum; m->gobble(in);
1092 map<string, int>::iterator it = groupCounts.find(group);
1093 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
1094 else { groupCounts[it->first] += tempNum; }
1097 in.close(); m->mothurRemove(tempFile);
1104 catch(exception& e) {
1105 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
1110 /**************************************************************************************************/
1112 int TrimSeqsCommand::setLines(string filename, string qfilename) {
1115 vector<unsigned long long> fastaFilePos;
1116 vector<unsigned long long> qfileFilePos;
1118 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1119 //set file positions for fasta file
1120 fastaFilePos = m->divideFile(filename, processors);
1122 //get name of first sequence in each chunk
1123 map<string, int> firstSeqNames;
1124 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1126 m->openInputFile(filename, in);
1127 in.seekg(fastaFilePos[i]);
1130 firstSeqNames[temp.getName()] = i;
1135 if(qfilename != "") {
1136 //seach for filePos of each first name in the qfile and save in qfileFilePos
1138 m->openInputFile(qfilename, inQual);
1141 while(!inQual.eof()){
1142 input = m->getline(inQual);
1144 if (input.length() != 0) {
1145 if(input[0] == '>'){ //this is a sequence name line
1146 istringstream nameStream(input);
1148 string sname = ""; nameStream >> sname;
1149 sname = sname.substr(1);
1151 map<string, int>::iterator it = firstSeqNames.find(sname);
1153 if(it != firstSeqNames.end()) { //this is the start of a new chunk
1154 unsigned long long pos = inQual.tellg();
1155 qfileFilePos.push_back(pos - input.length() - 1);
1156 firstSeqNames.erase(it);
1161 if (firstSeqNames.size() == 0) { break; }
1166 if (firstSeqNames.size() != 0) {
1167 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
1168 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1174 //get last file position of qfile
1176 unsigned long long size;
1178 //get num bytes in file
1179 pFile = fopen (qfilename.c_str(),"rb");
1180 if (pFile==NULL) perror ("Error opening file");
1182 fseek (pFile, 0, SEEK_END);
1187 qfileFilePos.push_back(size);
1190 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1191 if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) +'\t' + toString(fastaFilePos[i]) + '\t' + toString(fastaFilePos[i+1]) + '\n'); }
1192 lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
1193 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); }
1195 if(qfilename == "") { qLines = lines; } //files with duds
1201 if (processors == 1) { //save time
1202 //fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1203 //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
1204 lines.push_back(linePair(0, 1000));
1205 if (qfilename != "") { qLines.push_back(linePair(0, 1000)); }
1207 int numFastaSeqs = 0;
1208 fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs);
1209 if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); }
1211 if (qfilename != "") {
1212 int numQualSeqs = 0;
1213 qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs);
1215 if (numFastaSeqs != numQualSeqs) {
1216 m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true;
1220 //figure out how many sequences you have to process
1221 int numSeqsPerProcessor = numFastaSeqs / processors;
1222 for (int i = 0; i < processors; i++) {
1223 int startIndex = i * numSeqsPerProcessor;
1224 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
1225 lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
1226 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
1229 if(qfilename == "") { qLines = lines; } //files with duds
1234 catch(exception& e) {
1235 m->errorOut(e, "TrimSeqsCommand", "setLines");
1240 //***************************************************************************************************************
1242 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1245 m->openInputFile(oligoFile, inOligos);
1249 string type, oligo, group;
1251 int indexPrimer = 0;
1252 int indexBarcode = 0;
1254 while(!inOligos.eof()){
1259 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1260 m->gobble(inOligos);
1263 m->gobble(inOligos);
1264 //make type case insensitive
1265 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1269 for(int i=0;i<oligo.length();i++){
1270 oligo[i] = toupper(oligo[i]);
1271 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1274 if(type == "FORWARD"){
1277 // get rest of line in case there is a primer name
1278 while (!inOligos.eof()) {
1279 char c = inOligos.get();
1280 if (c == 10 || c == 13){ break; }
1281 else if (c == 32 || c == 9){;} //space or tab
1282 else { group += c; }
1285 //check for repeat barcodes
1286 map<string, int>::iterator itPrime = primers.find(oligo);
1287 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1289 primers[oligo]=indexPrimer; indexPrimer++;
1290 primerNameVector.push_back(group);
1292 else if(type == "REVERSE"){
1293 //Sequence oligoRC("reverse", oligo);
1294 //oligoRC.reverseComplement();
1295 string oligoRC = reverseOligo(oligo);
1296 revPrimer.push_back(oligoRC);
1298 else if(type == "BARCODE"){
1301 //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs
1302 //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info
1304 while (!inOligos.eof()) {
1305 char c = inOligos.get();
1306 if (c == 10 || c == 13){ break; }
1307 else if (c == 32 || c == 9){;} //space or tab
1311 //then this is illumina data with 4 columns
1313 string reverseBarcode = reverseOligo(group); //reverse barcode
1316 //check for repeat barcodes
1317 map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
1318 if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine(); }
1320 rbarcodes[reverseBarcode]=indexBarcode;
1323 //check for repeat barcodes
1324 map<string, int>::iterator itBar = barcodes.find(oligo);
1325 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1327 barcodes[oligo]=indexBarcode; indexBarcode++;
1328 barcodeNameVector.push_back(group);
1329 }else if(type == "LINKER"){
1330 linker.push_back(oligo);
1331 }else if(type == "SPACER"){
1332 spacer.push_back(oligo);
1334 else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1336 m->gobble(inOligos);
1340 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1342 //add in potential combos
1343 if(barcodeNameVector.size() == 0){
1345 barcodeNameVector.push_back("");
1348 if(primerNameVector.size() == 0){
1350 primerNameVector.push_back("");
1353 fastaFileNames.resize(barcodeNameVector.size());
1354 for(int i=0;i<fastaFileNames.size();i++){
1355 fastaFileNames[i].assign(primerNameVector.size(), "");
1357 if(qFileName != "") { qualFileNames = fastaFileNames; }
1358 if(nameFile != "") { nameFileNames = fastaFileNames; }
1361 set<string> uniqueNames; //used to cleanup outputFileNames
1362 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1363 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1365 string primerName = primerNameVector[itPrimer->second];
1366 string barcodeName = barcodeNameVector[itBar->second];
1368 string comboGroupName = "";
1369 string fastaFileName = "";
1370 string qualFileName = "";
1371 string nameFileName = "";
1373 if(primerName == ""){
1374 comboGroupName = barcodeNameVector[itBar->second];
1377 if(barcodeName == ""){
1378 comboGroupName = primerNameVector[itPrimer->second];
1381 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1387 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1388 if (uniqueNames.count(fastaFileName) == 0) {
1389 outputNames.push_back(fastaFileName);
1390 outputTypes["fasta"].push_back(fastaFileName);
1391 uniqueNames.insert(fastaFileName);
1394 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1395 m->openOutputFile(fastaFileName, temp); temp.close();
1397 if(qFileName != ""){
1398 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1399 if (uniqueNames.count(qualFileName) == 0) {
1400 outputNames.push_back(qualFileName);
1401 outputTypes["qfile"].push_back(qualFileName);
1404 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1405 m->openOutputFile(qualFileName, temp); temp.close();
1409 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1410 if (uniqueNames.count(nameFileName) == 0) {
1411 outputNames.push_back(nameFileName);
1412 outputTypes["name"].push_back(nameFileName);
1415 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1416 m->openOutputFile(nameFileName, temp); temp.close();
1422 numFPrimers = primers.size();
1423 numRPrimers = revPrimer.size();
1424 numLinkers = linker.size();
1425 numSpacers = spacer.size();
1427 bool allBlank = true;
1428 for (int i = 0; i < barcodeNameVector.size(); i++) {
1429 if (barcodeNameVector[i] != "") {
1434 for (int i = 0; i < primerNameVector.size(); i++) {
1435 if (primerNameVector[i] != "") {
1442 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1450 catch(exception& e) {
1451 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1455 //***************************************************************************************************************
1457 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1460 if(qscores.getName() != ""){
1461 qscores.trimQScores(-1, keepFirst);
1463 sequence.trim(keepFirst);
1466 catch(exception& e) {
1467 m->errorOut(e, "keepFirstTrim", "countDiffs");
1473 //***************************************************************************************************************
1475 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1479 int length = sequence.getNumBases() - removeLast;
1482 if(qscores.getName() != ""){
1483 qscores.trimQScores(-1, length);
1485 sequence.trim(length);
1494 catch(exception& e) {
1495 m->errorOut(e, "removeLastTrim", "countDiffs");
1501 //***************************************************************************************************************
1503 bool TrimSeqsCommand::cullLength(Sequence& seq){
1506 int length = seq.getNumBases();
1507 bool success = 0; //guilty until proven innocent
1509 if(length >= minLength && maxLength == 0) { success = 1; }
1510 else if(length >= minLength && length <= maxLength) { success = 1; }
1511 else { success = 0; }
1516 catch(exception& e) {
1517 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1523 //***************************************************************************************************************
1525 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1527 int longHomoP = seq.getLongHomoPolymer();
1528 bool success = 0; //guilty until proven innocent
1530 if(longHomoP <= maxHomoP){ success = 1; }
1531 else { success = 0; }
1535 catch(exception& e) {
1536 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1541 //********************************************************************/
1542 string TrimSeqsCommand::reverseOligo(string oligo){
1544 string reverse = "";
1546 for(int i=oligo.length()-1;i>=0;i--){
1548 if(oligo[i] == 'A') { reverse += 'T'; }
1549 else if(oligo[i] == 'T'){ reverse += 'A'; }
1550 else if(oligo[i] == 'U'){ reverse += 'A'; }
1552 else if(oligo[i] == 'G'){ reverse += 'C'; }
1553 else if(oligo[i] == 'C'){ reverse += 'G'; }
1555 else if(oligo[i] == 'R'){ reverse += 'Y'; }
1556 else if(oligo[i] == 'Y'){ reverse += 'R'; }
1558 else if(oligo[i] == 'M'){ reverse += 'K'; }
1559 else if(oligo[i] == 'K'){ reverse += 'M'; }
1561 else if(oligo[i] == 'W'){ reverse += 'W'; }
1562 else if(oligo[i] == 'S'){ reverse += 'S'; }
1564 else if(oligo[i] == 'B'){ reverse += 'V'; }
1565 else if(oligo[i] == 'V'){ reverse += 'B'; }
1567 else if(oligo[i] == 'D'){ reverse += 'H'; }
1568 else if(oligo[i] == 'H'){ reverse += 'D'; }
1570 else { reverse += 'N'; }
1576 catch(exception& e) {
1577 m->errorOut(e, "TrimSeqsCommand", "reverseOligo");
1582 //***************************************************************************************************************
1584 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1586 int numNs = seq.getAmbigBases();
1587 bool success = 0; //guilty until proven innocent
1589 if(numNs <= maxAmbig) { success = 1; }
1590 else { success = 0; }
1594 catch(exception& e) {
1595 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1600 //***************************************************************************************************************