5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12 #include "trimoligos.h"
14 //**********************************************************************************************************************
15 vector<string> TrimSeqsCommand::setParameters(){
17 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
18 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
19 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
21 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
22 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
23 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
24 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
25 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
26 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
27 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
28 CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
29 CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
30 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
31 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
32 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
33 CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward);
34 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
35 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
36 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
37 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
38 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
39 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
40 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
41 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
42 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
43 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
44 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
46 vector<string> myArray;
47 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
51 m->errorOut(e, "TrimSeqsCommand", "setParameters");
55 //**********************************************************************************************************************
56 string TrimSeqsCommand::getHelpString(){
58 string helpString = "";
59 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
60 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
61 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
62 helpString += "The fasta parameter is required.\n";
63 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
64 helpString += "The oligos parameter allows you to provide an oligos file.\n";
65 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
66 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
67 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
68 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
69 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
70 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
71 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
72 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
73 helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
74 helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
75 helpString += "The qfile parameter allows you to provide a quality file.\n";
76 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
77 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
78 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
79 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
80 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
81 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
82 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
83 helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n";
84 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
85 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
86 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
87 helpString += "The trim.seqs command should be in the following format: \n";
88 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
89 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
90 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
91 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
92 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
96 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
102 //**********************************************************************************************************************
104 TrimSeqsCommand::TrimSeqsCommand(){
106 abort = true; calledHelp = true;
108 vector<string> tempOutNames;
109 outputTypes["fasta"] = tempOutNames;
110 outputTypes["qfile"] = tempOutNames;
111 outputTypes["group"] = tempOutNames;
112 outputTypes["name"] = tempOutNames;
114 catch(exception& e) {
115 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
119 //***************************************************************************************************************
121 TrimSeqsCommand::TrimSeqsCommand(string option) {
124 abort = false; calledHelp = false;
127 //allow user to run help
128 if(option == "help") { help(); abort = true; calledHelp = true; }
129 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
132 vector<string> myArray = setParameters();
134 OptionParser parser(option);
135 map<string,string> parameters = parser.getParameters();
137 ValidParameters validParameter;
138 map<string,string>::iterator it;
140 //check to make sure all parameters are valid for command
141 for (it = parameters.begin(); it != parameters.end(); it++) {
142 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
145 //initialize outputTypes
146 vector<string> tempOutNames;
147 outputTypes["fasta"] = tempOutNames;
148 outputTypes["qfile"] = tempOutNames;
149 outputTypes["group"] = tempOutNames;
150 outputTypes["name"] = tempOutNames;
152 //if the user changes the input directory command factory will send this info to us in the output parameter
153 string inputDir = validParameter.validFile(parameters, "inputdir", false);
154 if (inputDir == "not found"){ inputDir = ""; }
157 it = parameters.find("fasta");
158 //user has given a template file
159 if(it != parameters.end()){
160 path = m->hasPath(it->second);
161 //if the user has not given a path then, add inputdir. else leave path alone.
162 if (path == "") { parameters["fasta"] = inputDir + it->second; }
165 it = parameters.find("oligos");
166 //user has given a template file
167 if(it != parameters.end()){
168 path = m->hasPath(it->second);
169 //if the user has not given a path then, add inputdir. else leave path alone.
170 if (path == "") { parameters["oligos"] = inputDir + it->second; }
173 it = parameters.find("qfile");
174 //user has given a template file
175 if(it != parameters.end()){
176 path = m->hasPath(it->second);
177 //if the user has not given a path then, add inputdir. else leave path alone.
178 if (path == "") { parameters["qfile"] = inputDir + it->second; }
181 it = parameters.find("name");
182 //user has given a template file
183 if(it != parameters.end()){
184 path = m->hasPath(it->second);
185 //if the user has not given a path then, add inputdir. else leave path alone.
186 if (path == "") { parameters["name"] = inputDir + it->second; }
192 //check for required parameters
193 fastaFile = validParameter.validFile(parameters, "fasta", true);
194 if (fastaFile == "not found") {
195 fastaFile = m->getFastaFile();
196 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
197 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
198 }else if (fastaFile == "not open") { abort = true; }
199 else { m->setFastaFile(fastaFile); }
201 //if the user changes the output directory command factory will send this info to us in the output parameter
202 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
204 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
208 //check for optional parameter and set defaults
209 // ...at some point should added some additional type checking...
211 temp = validParameter.validFile(parameters, "flip", false);
212 if (temp == "not found") { flip = 0; }
213 else { flip = m->isTrue(temp); }
215 temp = validParameter.validFile(parameters, "oligos", true);
216 if (temp == "not found"){ oligoFile = ""; }
217 else if(temp == "not open"){ abort = true; }
218 else { oligoFile = temp; m->setOligosFile(oligoFile); }
221 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
222 m->mothurConvert(temp, maxAmbig);
224 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
225 m->mothurConvert(temp, maxHomoP);
227 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
228 m->mothurConvert(temp, minLength);
230 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
231 m->mothurConvert(temp, maxLength);
233 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
234 m->mothurConvert(temp, bdiffs);
236 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
237 m->mothurConvert(temp, pdiffs);
239 temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
240 m->mothurConvert(temp, ldiffs);
242 temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
243 m->mothurConvert(temp, sdiffs);
245 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
246 m->mothurConvert(temp, tdiffs);
248 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
250 temp = validParameter.validFile(parameters, "qfile", true);
251 if (temp == "not found") { qFileName = ""; }
252 else if(temp == "not open") { abort = true; }
253 else { qFileName = temp; m->setQualFile(qFileName); }
255 temp = validParameter.validFile(parameters, "name", true);
256 if (temp == "not found") { nameFile = ""; }
257 else if(temp == "not open") { nameFile = ""; abort = true; }
258 else { nameFile = temp; m->setNameFile(nameFile); }
260 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
261 m->mothurConvert(temp, qThreshold);
263 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
264 qtrim = m->isTrue(temp);
266 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
267 convert(temp, qRollAverage);
269 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
270 convert(temp, qWindowAverage);
272 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
273 convert(temp, qWindowSize);
275 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
276 convert(temp, qWindowStep);
278 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
279 convert(temp, qAverage);
281 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
282 convert(temp, keepFirst);
284 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
285 convert(temp, removeLast);
287 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
288 allFiles = m->isTrue(temp);
290 temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; }
291 keepforward = m->isTrue(temp);
293 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
294 m->setProcessors(temp);
295 m->mothurConvert(temp, processors);
298 if(allFiles && (oligoFile == "")){
299 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
301 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
302 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
306 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
307 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
311 if (nameFile == "") {
312 vector<string> files; files.push_back(fastaFile);
313 parser.getNameFile(files);
318 catch(exception& e) {
319 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
323 //***************************************************************************************************************
325 int TrimSeqsCommand::execute(){
328 if (abort == true) { if (calledHelp) { return 0; } return 2; }
330 numFPrimers = 0; //this needs to be initialized
333 vector<vector<string> > fastaFileNames;
334 vector<vector<string> > qualFileNames;
335 vector<vector<string> > nameFileNames;
337 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
338 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
340 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
341 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
343 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
344 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
346 if (qFileName != "") {
347 outputNames.push_back(trimQualFile);
348 outputNames.push_back(scrapQualFile);
349 outputTypes["qfile"].push_back(trimQualFile);
350 outputTypes["qfile"].push_back(scrapQualFile);
353 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
354 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
356 if (nameFile != "") {
357 m->readNames(nameFile, nameMap);
358 outputNames.push_back(trimNameFile);
359 outputNames.push_back(scrapNameFile);
360 outputTypes["name"].push_back(trimNameFile);
361 outputTypes["name"].push_back(scrapNameFile);
364 if (m->control_pressed) { return 0; }
366 string outputGroupFileName;
368 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
370 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
371 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
375 //fills lines and qlines
376 setLines(fastaFile, qFileName);
379 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
381 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
385 if (m->control_pressed) { return 0; }
388 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
389 map<string, string>::iterator it;
390 set<string> namesToRemove;
391 for(int i=0;i<fastaFileNames.size();i++){
392 for(int j=0;j<fastaFileNames[0].size();j++){
393 if (fastaFileNames[i][j] != "") {
394 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
395 if(m->isBlank(fastaFileNames[i][j])){
396 m->mothurRemove(fastaFileNames[i][j]);
397 namesToRemove.insert(fastaFileNames[i][j]);
400 m->mothurRemove(qualFileNames[i][j]);
401 namesToRemove.insert(qualFileNames[i][j]);
405 m->mothurRemove(nameFileNames[i][j]);
406 namesToRemove.insert(nameFileNames[i][j]);
409 it = uniqueFastaNames.find(fastaFileNames[i][j]);
410 if (it == uniqueFastaNames.end()) {
411 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
419 //remove names for outputFileNames, just cleans up the output
420 vector<string> outputNames2;
421 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
422 outputNames = outputNames2;
424 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
426 m->openInputFile(it->first, in);
429 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
430 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
431 m->openOutputFile(thisGroupName, out);
434 if (m->control_pressed) { break; }
436 Sequence currSeq(in); m->gobble(in);
437 out << currSeq.getName() << '\t' << it->second << endl;
444 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
446 //output group counts
447 m->mothurOutEndLine();
449 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
450 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
451 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
453 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
455 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
457 //set fasta file as new current fastafile
459 itTypes = outputTypes.find("fasta");
460 if (itTypes != outputTypes.end()) {
461 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
464 itTypes = outputTypes.find("name");
465 if (itTypes != outputTypes.end()) {
466 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
469 itTypes = outputTypes.find("qfile");
470 if (itTypes != outputTypes.end()) {
471 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
474 itTypes = outputTypes.find("group");
475 if (itTypes != outputTypes.end()) {
476 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
479 m->mothurOutEndLine();
480 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
481 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
482 m->mothurOutEndLine();
487 catch(exception& e) {
488 m->errorOut(e, "TrimSeqsCommand", "execute");
493 /**************************************************************************************/
495 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {
499 ofstream trimFASTAFile;
500 m->openOutputFile(trimFileName, trimFASTAFile);
502 ofstream scrapFASTAFile;
503 m->openOutputFile(scrapFileName, scrapFASTAFile);
505 ofstream trimQualFile;
506 ofstream scrapQualFile;
508 m->openOutputFile(trimQFileName, trimQualFile);
509 m->openOutputFile(scrapQFileName, scrapQualFile);
512 ofstream trimNameFile;
513 ofstream scrapNameFile;
515 m->openOutputFile(trimNFileName, trimNameFile);
516 m->openOutputFile(scrapNFileName, scrapNameFile);
520 ofstream outGroupsFile;
521 if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); }
523 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
524 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
525 if (fastaFileNames[i][j] != "") {
527 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
529 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
533 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
541 m->openInputFile(filename, inFASTA);
542 inFASTA.seekg(line.start);
545 if(qFileName != "") {
546 m->openInputFile(qFileName, qFile);
547 qFile.seekg(qline.start);
552 TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
556 if (m->control_pressed) {
557 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
558 if (createGroup) { outGroupsFile.close(); }
563 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
569 string trashCode = "";
570 int currentSeqsDiffs = 0;
572 Sequence currSeq(inFASTA); m->gobble(inFASTA);
573 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
574 QualityScores currQual;
576 currQual = QualityScores(qFile); m->gobble(qFile);
579 string origSeq = currSeq.getUnaligned();
582 int barcodeIndex = 0;
586 success = trimOligos.stripLinker(currSeq, currQual);
587 if(success > ldiffs) { trashCode += 'k'; }
588 else{ currentSeqsDiffs += success; }
592 if(barcodes.size() != 0){
593 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
594 if(success > bdiffs) { trashCode += 'b'; }
595 else{ currentSeqsDiffs += success; }
599 success = trimOligos.stripSpacer(currSeq, currQual);
600 if(success > sdiffs) { trashCode += 's'; }
601 else{ currentSeqsDiffs += success; }
605 if(numFPrimers != 0){
606 success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward);
607 if(success > pdiffs) { trashCode += 'f'; }
608 else{ currentSeqsDiffs += success; }
611 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
613 if(numRPrimers != 0){
614 success = trimOligos.stripReverse(currSeq, currQual);
615 if(!success) { trashCode += 'r'; }
619 success = keepFirstTrim(currSeq, currQual);
623 success = removeLastTrim(currSeq, currQual);
624 if(!success) { trashCode += 'l'; }
629 int origLength = currSeq.getNumBases();
631 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
632 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
633 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
634 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
635 else { success = 1; }
637 //you don't want to trim, if it fails above then scrap it
638 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
640 if(!success) { trashCode += 'q'; }
643 if(minLength > 0 || maxLength > 0){
644 success = cullLength(currSeq);
645 if(!success) { trashCode += 'l'; }
648 success = cullHomoP(currSeq);
649 if(!success) { trashCode += 'h'; }
652 success = cullAmbigs(currSeq);
653 if(!success) { trashCode += 'n'; }
656 if(flip){ // should go last
657 currSeq.reverseComplement();
659 currQual.flipQScores();
663 if(trashCode.length() == 0){
664 currSeq.setAligned(currSeq.getUnaligned());
665 currSeq.printSequence(trimFASTAFile);
668 currQual.printQScores(trimQualFile);
672 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
673 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
674 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
678 if(barcodes.size() != 0){
679 string thisGroup = barcodeNameVector[barcodeIndex];
680 if (primers.size() != 0) {
681 if (primerNameVector[primerIndex] != "") {
682 if(thisGroup != "") {
683 thisGroup += "." + primerNameVector[primerIndex];
685 thisGroup = primerNameVector[primerIndex];
690 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
692 if (nameFile != "") {
693 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
694 if (itName != nameMap.end()) {
695 vector<string> thisSeqsNames;
696 m->splitAtChar(itName->second, thisSeqsNames, ',');
697 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
698 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
700 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
703 map<string, int>::iterator it = groupCounts.find(thisGroup);
704 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
705 else { groupCounts[it->first]++; }
712 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
713 currSeq.printSequence(output);
717 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
718 currQual.printQScores(output);
723 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
724 if (itName != nameMap.end()) {
725 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
726 output << itName->first << '\t' << itName->second << endl;
728 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
733 if(nameFile != ""){ //needs to be before the currSeq name is changed
734 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
735 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
736 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
738 currSeq.setName(currSeq.getName() + '|' + trashCode);
739 currSeq.setUnaligned(origSeq);
740 currSeq.setAligned(origSeq);
741 currSeq.printSequence(scrapFASTAFile);
743 currQual.printQScores(scrapQualFile);
749 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
750 unsigned long long pos = inFASTA.tellg();
751 if ((pos == -1) || (pos >= line.end)) { break; }
754 if (inFASTA.eof()) { break; }
758 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
762 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
766 trimFASTAFile.close();
767 scrapFASTAFile.close();
768 if (createGroup) { outGroupsFile.close(); }
769 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
770 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
774 catch(exception& e) {
775 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
780 /**************************************************************************************************/
782 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
789 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
790 //loop through and create all the processes you want
791 while (process != processors) {
795 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
799 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
800 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
801 vector<vector<string> > tempNameFileNames = nameFileNames;
806 for(int i=0;i<tempFASTAFileNames.size();i++){
807 for(int j=0;j<tempFASTAFileNames[i].size();j++){
808 if (tempFASTAFileNames[i][j] != "") {
809 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
810 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
813 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
814 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
817 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
818 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
825 driverCreateTrim(filename,
827 (trimFASTAFileName + toString(getpid()) + ".temp"),
828 (scrapFASTAFileName + toString(getpid()) + ".temp"),
829 (trimQualFileName + toString(getpid()) + ".temp"),
830 (scrapQualFileName + toString(getpid()) + ".temp"),
831 (trimNameFileName + toString(getpid()) + ".temp"),
832 (scrapNameFileName + toString(getpid()) + ".temp"),
833 (groupFile + toString(getpid()) + ".temp"),
835 tempPrimerQualFileNames,
840 //pass groupCounts to parent
843 string tempFile = filename + toString(getpid()) + ".num.temp";
844 m->openOutputFile(tempFile, out);
846 out << groupCounts.size() << endl;
848 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
849 out << it->first << '\t' << it->second << endl;
855 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
856 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
863 m->openOutputFile(trimFASTAFileName, temp); temp.close();
864 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
866 m->openOutputFile(trimQualFileName, temp); temp.close();
867 m->openOutputFile(scrapQualFileName, temp); temp.close();
869 if (nameFile != "") {
870 m->openOutputFile(trimNameFileName, temp); temp.close();
871 m->openOutputFile(scrapNameFileName, temp); temp.close();
874 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
876 //force parent to wait until all the processes are done
877 for (int i=0;i<processIDS.size();i++) {
878 int temp = processIDS[i];
882 //////////////////////////////////////////////////////////////////////////////////////////////////////
883 //Windows version shared memory, so be careful when passing variables through the trimData struct.
884 //Above fork() will clone, so memory is separate, but that's not the case with windows,
885 //////////////////////////////////////////////////////////////////////////////////////////////////////
887 vector<trimData*> pDataArray;
888 DWORD dwThreadIdArray[processors-1];
889 HANDLE hThreadArray[processors-1];
891 //Create processor worker threads.
892 for( int i=0; i<processors-1; i++){
894 string extension = "";
895 if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
896 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
897 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
898 vector<vector<string> > tempNameFileNames = nameFileNames;
903 for(int i=0;i<tempFASTAFileNames.size();i++){
904 for(int j=0;j<tempFASTAFileNames[i].size();j++){
905 if (tempFASTAFileNames[i][j] != "") {
906 tempFASTAFileNames[i][j] += extension;
907 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
910 tempPrimerQualFileNames[i][j] += extension;
911 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
914 tempNameFileNames[i][j] += extension;
915 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
923 trimData* tempTrim = new trimData(filename,
925 (trimFASTAFileName+extension),
926 (scrapFASTAFileName+extension),
927 (trimQualFileName+extension),
928 (scrapQualFileName+extension),
929 (trimNameFileName+extension),
930 (scrapNameFileName+extension),
931 (groupFile+extension),
933 tempPrimerQualFileNames,
935 lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
936 pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer,
937 primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
938 qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
939 minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap);
940 pDataArray.push_back(tempTrim);
942 hThreadArray[i] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
947 m->openOutputFile(trimFASTAFileName, temp); temp.close();
948 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
950 m->openOutputFile(trimQualFileName, temp); temp.close();
951 m->openOutputFile(scrapQualFileName, temp); temp.close();
953 if (nameFile != "") {
954 m->openOutputFile(trimNameFileName, temp); temp.close();
955 m->openOutputFile(scrapNameFileName, temp); temp.close();
958 driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]);
959 processIDS.push_back(processors-1);
962 //Wait until all threads have terminated.
963 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
965 //Close all thread handles and free memory allocations.
966 for(int i=0; i < pDataArray.size(); i++){
967 for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
968 map<string, int>::iterator it2 = groupCounts.find(it->first);
969 if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
970 else { groupCounts[it->first] += it->second; }
972 CloseHandle(hThreadArray[i]);
973 delete pDataArray[i];
980 for(int i=0;i<processIDS.size();i++){
982 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
984 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
985 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
986 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
987 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
990 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
991 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
992 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
993 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
997 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
998 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
999 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
1000 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
1004 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
1005 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
1010 for(int j=0;j<fastaFileNames.size();j++){
1011 for(int k=0;k<fastaFileNames[j].size();k++){
1012 if (fastaFileNames[j][k] != "") {
1013 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
1014 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1016 if(qFileName != ""){
1017 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
1018 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1022 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
1023 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1030 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1033 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1034 m->openInputFile(tempFile, in);
1038 in >> tempNum; m->gobble(in);
1042 in >> group >> tempNum; m->gobble(in);
1044 map<string, int>::iterator it = groupCounts.find(group);
1045 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
1046 else { groupCounts[it->first] += tempNum; }
1049 in.close(); m->mothurRemove(tempFile);
1056 catch(exception& e) {
1057 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
1062 /**************************************************************************************************/
1064 int TrimSeqsCommand::setLines(string filename, string qfilename) {
1067 vector<unsigned long long> fastaFilePos;
1068 vector<unsigned long long> qfileFilePos;
1070 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1071 //set file positions for fasta file
1072 fastaFilePos = m->divideFile(filename, processors);
1074 //get name of first sequence in each chunk
1075 map<string, int> firstSeqNames;
1076 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1078 m->openInputFile(filename, in);
1079 in.seekg(fastaFilePos[i]);
1082 firstSeqNames[temp.getName()] = i;
1087 if(qfilename != "") {
1088 //seach for filePos of each first name in the qfile and save in qfileFilePos
1090 m->openInputFile(qfilename, inQual);
1093 while(!inQual.eof()){
1094 input = m->getline(inQual);
1096 if (input.length() != 0) {
1097 if(input[0] == '>'){ //this is a sequence name line
1098 istringstream nameStream(input);
1100 string sname = ""; nameStream >> sname;
1101 sname = sname.substr(1);
1103 map<string, int>::iterator it = firstSeqNames.find(sname);
1105 if(it != firstSeqNames.end()) { //this is the start of a new chunk
1106 unsigned long long pos = inQual.tellg();
1107 qfileFilePos.push_back(pos - input.length() - 1);
1108 firstSeqNames.erase(it);
1113 if (firstSeqNames.size() == 0) { break; }
1118 if (firstSeqNames.size() != 0) {
1119 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
1120 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1126 //get last file position of qfile
1128 unsigned long long size;
1130 //get num bytes in file
1131 pFile = fopen (qfilename.c_str(),"rb");
1132 if (pFile==NULL) perror ("Error opening file");
1134 fseek (pFile, 0, SEEK_END);
1139 qfileFilePos.push_back(size);
1142 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1143 lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
1144 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); }
1146 if(qfilename == "") { qLines = lines; } //files with duds
1152 if (processors == 1) { //save time
1153 //fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1154 //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
1155 lines.push_back(linePair(0, 1000));
1156 if (qfilename != "") { qLines.push_back(linePair(0, 1000)); }
1158 int numFastaSeqs = 0;
1159 fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs);
1160 if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); }
1162 if (qfilename != "") {
1163 int numQualSeqs = 0;
1164 qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs);
1166 if (numFastaSeqs != numQualSeqs) {
1167 m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true;
1171 //figure out how many sequences you have to process
1172 int numSeqsPerProcessor = numFastaSeqs / processors;
1173 for (int i = 0; i < processors; i++) {
1174 int startIndex = i * numSeqsPerProcessor;
1175 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
1176 lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
1177 cout << fastaFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl;
1178 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
1181 if(qfilename == "") { qLines = lines; } //files with duds
1187 catch(exception& e) {
1188 m->errorOut(e, "TrimSeqsCommand", "setLines");
1193 //***************************************************************************************************************
1195 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1198 m->openInputFile(oligoFile, inOligos);
1202 string type, oligo, group;
1204 int indexPrimer = 0;
1205 int indexBarcode = 0;
1207 while(!inOligos.eof()){
1212 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1213 m->gobble(inOligos);
1216 m->gobble(inOligos);
1217 //make type case insensitive
1218 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1222 for(int i=0;i<oligo.length();i++){
1223 oligo[i] = toupper(oligo[i]);
1224 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1227 if(type == "FORWARD"){
1230 // get rest of line in case there is a primer name
1231 while (!inOligos.eof()) {
1232 char c = inOligos.get();
1233 if (c == 10 || c == 13){ break; }
1234 else if (c == 32 || c == 9){;} //space or tab
1235 else { group += c; }
1238 //check for repeat barcodes
1239 map<string, int>::iterator itPrime = primers.find(oligo);
1240 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1242 primers[oligo]=indexPrimer; indexPrimer++;
1243 primerNameVector.push_back(group);
1245 else if(type == "REVERSE"){
1246 //Sequence oligoRC("reverse", oligo);
1247 //oligoRC.reverseComplement();
1248 string oligoRC = reverseOligo(oligo);
1249 revPrimer.push_back(oligoRC);
1251 else if(type == "BARCODE"){
1254 //check for repeat barcodes
1255 map<string, int>::iterator itBar = barcodes.find(oligo);
1256 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1258 barcodes[oligo]=indexBarcode; indexBarcode++;
1259 barcodeNameVector.push_back(group);
1260 }else if(type == "LINKER"){
1261 linker.push_back(oligo);
1262 }else if(type == "SPACER"){
1263 spacer.push_back(oligo);
1265 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1267 m->gobble(inOligos);
1271 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1273 //add in potential combos
1274 if(barcodeNameVector.size() == 0){
1276 barcodeNameVector.push_back("");
1279 if(primerNameVector.size() == 0){
1281 primerNameVector.push_back("");
1284 fastaFileNames.resize(barcodeNameVector.size());
1285 for(int i=0;i<fastaFileNames.size();i++){
1286 fastaFileNames[i].assign(primerNameVector.size(), "");
1288 if(qFileName != "") { qualFileNames = fastaFileNames; }
1289 if(nameFile != "") { nameFileNames = fastaFileNames; }
1292 set<string> uniqueNames; //used to cleanup outputFileNames
1293 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1294 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1296 string primerName = primerNameVector[itPrimer->second];
1297 string barcodeName = barcodeNameVector[itBar->second];
1299 string comboGroupName = "";
1300 string fastaFileName = "";
1301 string qualFileName = "";
1302 string nameFileName = "";
1304 if(primerName == ""){
1305 comboGroupName = barcodeNameVector[itBar->second];
1308 if(barcodeName == ""){
1309 comboGroupName = primerNameVector[itPrimer->second];
1312 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1318 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1319 if (uniqueNames.count(fastaFileName) == 0) {
1320 outputNames.push_back(fastaFileName);
1321 outputTypes["fasta"].push_back(fastaFileName);
1322 uniqueNames.insert(fastaFileName);
1325 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1326 m->openOutputFile(fastaFileName, temp); temp.close();
1328 if(qFileName != ""){
1329 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1330 if (uniqueNames.count(qualFileName) == 0) {
1331 outputNames.push_back(qualFileName);
1332 outputTypes["qfile"].push_back(qualFileName);
1335 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1336 m->openOutputFile(qualFileName, temp); temp.close();
1340 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1341 if (uniqueNames.count(nameFileName) == 0) {
1342 outputNames.push_back(nameFileName);
1343 outputTypes["name"].push_back(nameFileName);
1346 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1347 m->openOutputFile(nameFileName, temp); temp.close();
1353 numFPrimers = primers.size();
1354 numRPrimers = revPrimer.size();
1355 numLinkers = linker.size();
1356 numSpacers = spacer.size();
1358 bool allBlank = true;
1359 for (int i = 0; i < barcodeNameVector.size(); i++) {
1360 if (barcodeNameVector[i] != "") {
1365 for (int i = 0; i < primerNameVector.size(); i++) {
1366 if (primerNameVector[i] != "") {
1373 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1381 catch(exception& e) {
1382 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1386 //***************************************************************************************************************
1388 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1391 if(qscores.getName() != ""){
1392 qscores.trimQScores(-1, keepFirst);
1394 sequence.trim(keepFirst);
1397 catch(exception& e) {
1398 m->errorOut(e, "keepFirstTrim", "countDiffs");
1404 //***************************************************************************************************************
1406 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1410 int length = sequence.getNumBases() - removeLast;
1413 if(qscores.getName() != ""){
1414 qscores.trimQScores(-1, length);
1416 sequence.trim(length);
1425 catch(exception& e) {
1426 m->errorOut(e, "removeLastTrim", "countDiffs");
1432 //***************************************************************************************************************
1434 bool TrimSeqsCommand::cullLength(Sequence& seq){
1437 int length = seq.getNumBases();
1438 bool success = 0; //guilty until proven innocent
1440 if(length >= minLength && maxLength == 0) { success = 1; }
1441 else if(length >= minLength && length <= maxLength) { success = 1; }
1442 else { success = 0; }
1447 catch(exception& e) {
1448 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1454 //***************************************************************************************************************
1456 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1458 int longHomoP = seq.getLongHomoPolymer();
1459 bool success = 0; //guilty until proven innocent
1461 if(longHomoP <= maxHomoP){ success = 1; }
1462 else { success = 0; }
1466 catch(exception& e) {
1467 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1472 //********************************************************************/
1473 string TrimSeqsCommand::reverseOligo(string oligo){
1475 string reverse = "";
1477 for(int i=oligo.length()-1;i>=0;i--){
1479 if(oligo[i] == 'A') { reverse += 'T'; }
1480 else if(oligo[i] == 'T'){ reverse += 'A'; }
1481 else if(oligo[i] == 'U'){ reverse += 'A'; }
1483 else if(oligo[i] == 'G'){ reverse += 'C'; }
1484 else if(oligo[i] == 'C'){ reverse += 'G'; }
1486 else if(oligo[i] == 'R'){ reverse += 'Y'; }
1487 else if(oligo[i] == 'Y'){ reverse += 'R'; }
1489 else if(oligo[i] == 'M'){ reverse += 'K'; }
1490 else if(oligo[i] == 'K'){ reverse += 'M'; }
1492 else if(oligo[i] == 'W'){ reverse += 'W'; }
1493 else if(oligo[i] == 'S'){ reverse += 'S'; }
1495 else if(oligo[i] == 'B'){ reverse += 'V'; }
1496 else if(oligo[i] == 'V'){ reverse += 'B'; }
1498 else if(oligo[i] == 'D'){ reverse += 'H'; }
1499 else if(oligo[i] == 'H'){ reverse += 'D'; }
1501 else { reverse += 'N'; }
1507 catch(exception& e) {
1508 m->errorOut(e, "TrimSeqsCommand", "reverseOligo");
1513 //***************************************************************************************************************
1515 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1517 int numNs = seq.getAmbigBases();
1518 bool success = 0; //guilty until proven innocent
1520 if(numNs <= maxAmbig) { success = 1; }
1521 else { success = 0; }
1525 catch(exception& e) {
1526 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1531 //***************************************************************************************************************