5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12 #include "trimoligos.h"
14 //**********************************************************************************************************************
15 vector<string> TrimSeqsCommand::setParameters(){
17 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
18 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
19 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
21 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
22 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
23 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
24 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
25 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
26 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
27 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
28 CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
29 CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
30 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
31 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
32 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
33 CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward);
34 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
35 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
36 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
37 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
38 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
39 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
40 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
41 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
42 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
43 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
44 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
46 vector<string> myArray;
47 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
51 m->errorOut(e, "TrimSeqsCommand", "setParameters");
55 //**********************************************************************************************************************
56 string TrimSeqsCommand::getHelpString(){
58 string helpString = "";
59 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
60 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
61 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
62 helpString += "The fasta parameter is required.\n";
63 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
64 helpString += "The oligos parameter allows you to provide an oligos file.\n";
65 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
66 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
67 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
68 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
69 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
70 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
71 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
72 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
73 helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
74 helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
75 helpString += "The qfile parameter allows you to provide a quality file.\n";
76 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
77 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
78 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
79 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
80 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
81 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
82 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
83 helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n";
84 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
85 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
86 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
87 helpString += "The trim.seqs command should be in the following format: \n";
88 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
89 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
90 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
91 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
92 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
96 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
102 //**********************************************************************************************************************
104 TrimSeqsCommand::TrimSeqsCommand(){
106 abort = true; calledHelp = true;
108 vector<string> tempOutNames;
109 outputTypes["fasta"] = tempOutNames;
110 outputTypes["qfile"] = tempOutNames;
111 outputTypes["group"] = tempOutNames;
112 outputTypes["name"] = tempOutNames;
114 catch(exception& e) {
115 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
119 //***************************************************************************************************************
121 TrimSeqsCommand::TrimSeqsCommand(string option) {
124 abort = false; calledHelp = false;
127 //allow user to run help
128 if(option == "help") { help(); abort = true; calledHelp = true; }
129 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
132 vector<string> myArray = setParameters();
134 OptionParser parser(option);
135 map<string,string> parameters = parser.getParameters();
137 ValidParameters validParameter;
138 map<string,string>::iterator it;
140 //check to make sure all parameters are valid for command
141 for (it = parameters.begin(); it != parameters.end(); it++) {
142 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
145 //initialize outputTypes
146 vector<string> tempOutNames;
147 outputTypes["fasta"] = tempOutNames;
148 outputTypes["qfile"] = tempOutNames;
149 outputTypes["group"] = tempOutNames;
150 outputTypes["name"] = tempOutNames;
152 //if the user changes the input directory command factory will send this info to us in the output parameter
153 string inputDir = validParameter.validFile(parameters, "inputdir", false);
154 if (inputDir == "not found"){ inputDir = ""; }
157 it = parameters.find("fasta");
158 //user has given a template file
159 if(it != parameters.end()){
160 path = m->hasPath(it->second);
161 //if the user has not given a path then, add inputdir. else leave path alone.
162 if (path == "") { parameters["fasta"] = inputDir + it->second; }
165 it = parameters.find("oligos");
166 //user has given a template file
167 if(it != parameters.end()){
168 path = m->hasPath(it->second);
169 //if the user has not given a path then, add inputdir. else leave path alone.
170 if (path == "") { parameters["oligos"] = inputDir + it->second; }
173 it = parameters.find("qfile");
174 //user has given a template file
175 if(it != parameters.end()){
176 path = m->hasPath(it->second);
177 //if the user has not given a path then, add inputdir. else leave path alone.
178 if (path == "") { parameters["qfile"] = inputDir + it->second; }
181 it = parameters.find("name");
182 //user has given a template file
183 if(it != parameters.end()){
184 path = m->hasPath(it->second);
185 //if the user has not given a path then, add inputdir. else leave path alone.
186 if (path == "") { parameters["name"] = inputDir + it->second; }
192 //check for required parameters
193 fastaFile = validParameter.validFile(parameters, "fasta", true);
194 if (fastaFile == "not found") {
195 fastaFile = m->getFastaFile();
196 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
197 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
198 }else if (fastaFile == "not open") { abort = true; }
199 else { m->setFastaFile(fastaFile); }
201 //if the user changes the output directory command factory will send this info to us in the output parameter
202 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
204 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
208 //check for optional parameter and set defaults
209 // ...at some point should added some additional type checking...
211 temp = validParameter.validFile(parameters, "flip", false);
212 if (temp == "not found") { flip = 0; }
213 else { flip = m->isTrue(temp); }
215 temp = validParameter.validFile(parameters, "oligos", true);
216 if (temp == "not found"){ oligoFile = ""; }
217 else if(temp == "not open"){ abort = true; }
218 else { oligoFile = temp; m->setOligosFile(oligoFile); }
221 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
222 m->mothurConvert(temp, maxAmbig);
224 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
225 m->mothurConvert(temp, maxHomoP);
227 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
228 m->mothurConvert(temp, minLength);
230 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
231 m->mothurConvert(temp, maxLength);
233 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
234 m->mothurConvert(temp, bdiffs);
236 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
237 m->mothurConvert(temp, pdiffs);
239 temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
240 m->mothurConvert(temp, ldiffs);
242 temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
243 m->mothurConvert(temp, sdiffs);
245 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
246 m->mothurConvert(temp, tdiffs);
248 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
250 temp = validParameter.validFile(parameters, "qfile", true);
251 if (temp == "not found") { qFileName = ""; }
252 else if(temp == "not open") { abort = true; }
253 else { qFileName = temp; m->setQualFile(qFileName); }
255 temp = validParameter.validFile(parameters, "name", true);
256 if (temp == "not found") { nameFile = ""; }
257 else if(temp == "not open") { nameFile = ""; abort = true; }
258 else { nameFile = temp; m->setNameFile(nameFile); }
260 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
261 m->mothurConvert(temp, qThreshold);
263 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
264 qtrim = m->isTrue(temp);
266 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
267 convert(temp, qRollAverage);
269 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
270 convert(temp, qWindowAverage);
272 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
273 convert(temp, qWindowSize);
275 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
276 convert(temp, qWindowStep);
278 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
279 convert(temp, qAverage);
281 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
282 convert(temp, keepFirst);
284 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
285 convert(temp, removeLast);
287 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
288 allFiles = m->isTrue(temp);
290 temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; }
291 keepforward = m->isTrue(temp);
293 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
294 m->setProcessors(temp);
295 m->mothurConvert(temp, processors);
298 if(allFiles && (oligoFile == "")){
299 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
301 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
302 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
306 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
307 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
311 if (nameFile == "") {
312 vector<string> files; files.push_back(fastaFile);
313 parser.getNameFile(files);
318 catch(exception& e) {
319 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
323 //***************************************************************************************************************
325 int TrimSeqsCommand::execute(){
328 if (abort == true) { if (calledHelp) { return 0; } return 2; }
330 numFPrimers = 0; //this needs to be initialized
335 vector<vector<string> > fastaFileNames;
336 vector<vector<string> > qualFileNames;
337 vector<vector<string> > nameFileNames;
339 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
340 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
342 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
343 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
345 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
346 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
348 if (qFileName != "") {
349 outputNames.push_back(trimQualFile);
350 outputNames.push_back(scrapQualFile);
351 outputTypes["qfile"].push_back(trimQualFile);
352 outputTypes["qfile"].push_back(scrapQualFile);
355 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
356 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
358 if (nameFile != "") {
359 m->readNames(nameFile, nameMap);
360 outputNames.push_back(trimNameFile);
361 outputNames.push_back(scrapNameFile);
362 outputTypes["name"].push_back(trimNameFile);
363 outputTypes["name"].push_back(scrapNameFile);
366 if (m->control_pressed) { return 0; }
368 string outputGroupFileName;
370 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
372 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
373 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
377 //fills lines and qlines
378 setLines(fastaFile, qFileName);
381 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
383 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
387 if (m->control_pressed) { return 0; }
390 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
391 map<string, string>::iterator it;
392 set<string> namesToRemove;
393 for(int i=0;i<fastaFileNames.size();i++){
394 for(int j=0;j<fastaFileNames[0].size();j++){
395 if (fastaFileNames[i][j] != "") {
396 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
397 if(m->isBlank(fastaFileNames[i][j])){
398 m->mothurRemove(fastaFileNames[i][j]);
399 namesToRemove.insert(fastaFileNames[i][j]);
402 m->mothurRemove(qualFileNames[i][j]);
403 namesToRemove.insert(qualFileNames[i][j]);
407 m->mothurRemove(nameFileNames[i][j]);
408 namesToRemove.insert(nameFileNames[i][j]);
411 it = uniqueFastaNames.find(fastaFileNames[i][j]);
412 if (it == uniqueFastaNames.end()) {
413 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
421 //remove names for outputFileNames, just cleans up the output
422 vector<string> outputNames2;
423 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
424 outputNames = outputNames2;
426 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
428 m->openInputFile(it->first, in);
431 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
432 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
433 m->openOutputFile(thisGroupName, out);
436 if (m->control_pressed) { break; }
438 Sequence currSeq(in); m->gobble(in);
439 out << currSeq.getName() << '\t' << it->second << endl;
441 if (nameFile != "") {
442 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
443 if (itName != nameMap.end()) {
444 vector<string> thisSeqsNames;
445 m->splitAtChar(itName->second, thisSeqsNames, ',');
446 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
447 out << thisSeqsNames[k] << '\t' << it->second << endl;
449 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
457 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
459 //output group counts
460 m->mothurOutEndLine();
462 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
463 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
464 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
466 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
468 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
470 //set fasta file as new current fastafile
472 itTypes = outputTypes.find("fasta");
473 if (itTypes != outputTypes.end()) {
474 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
477 itTypes = outputTypes.find("name");
478 if (itTypes != outputTypes.end()) {
479 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
482 itTypes = outputTypes.find("qfile");
483 if (itTypes != outputTypes.end()) {
484 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
487 itTypes = outputTypes.find("group");
488 if (itTypes != outputTypes.end()) {
489 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
492 m->mothurOutEndLine();
493 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
494 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
495 m->mothurOutEndLine();
500 catch(exception& e) {
501 m->errorOut(e, "TrimSeqsCommand", "execute");
506 /**************************************************************************************/
508 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {
512 ofstream trimFASTAFile;
513 m->openOutputFile(trimFileName, trimFASTAFile);
515 ofstream scrapFASTAFile;
516 m->openOutputFile(scrapFileName, scrapFASTAFile);
518 ofstream trimQualFile;
519 ofstream scrapQualFile;
521 m->openOutputFile(trimQFileName, trimQualFile);
522 m->openOutputFile(scrapQFileName, scrapQualFile);
525 ofstream trimNameFile;
526 ofstream scrapNameFile;
528 m->openOutputFile(trimNFileName, trimNameFile);
529 m->openOutputFile(scrapNFileName, scrapNameFile);
533 ofstream outGroupsFile;
534 if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); }
536 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
537 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
538 if (fastaFileNames[i][j] != "") {
540 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
542 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
546 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
554 m->openInputFile(filename, inFASTA);
555 inFASTA.seekg(line.start);
558 if(qFileName != "") {
559 m->openInputFile(qFileName, qFile);
560 qFile.seekg(qline.start);
565 TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
569 if (m->control_pressed) {
570 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
571 if (createGroup) { outGroupsFile.close(); }
576 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
582 string trashCode = "";
583 int currentSeqsDiffs = 0;
585 Sequence currSeq(inFASTA); m->gobble(inFASTA);
586 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
587 QualityScores currQual;
589 currQual = QualityScores(qFile); m->gobble(qFile);
592 string origSeq = currSeq.getUnaligned();
595 int barcodeIndex = 0;
599 success = trimOligos.stripLinker(currSeq, currQual);
600 if(success > ldiffs) { trashCode += 'k'; }
601 else{ currentSeqsDiffs += success; }
605 if(barcodes.size() != 0){
606 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
607 if(success > bdiffs) { trashCode += 'b'; }
608 else{ currentSeqsDiffs += success; }
611 if(rbarcodes.size() != 0){
612 success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
613 if(success > bdiffs) { trashCode += 'b'; }
614 else{ currentSeqsDiffs += success; }
618 success = trimOligos.stripSpacer(currSeq, currQual);
619 if(success > sdiffs) { trashCode += 's'; }
620 else{ currentSeqsDiffs += success; }
624 if(numFPrimers != 0){
625 success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward);
626 if(success > pdiffs) { trashCode += 'f'; }
627 else{ currentSeqsDiffs += success; }
630 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
632 if(numRPrimers != 0){
633 success = trimOligos.stripReverse(currSeq, currQual);
634 if(!success) { trashCode += 'r'; }
638 success = keepFirstTrim(currSeq, currQual);
642 success = removeLastTrim(currSeq, currQual);
643 if(!success) { trashCode += 'l'; }
648 int origLength = currSeq.getNumBases();
650 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
651 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
652 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
653 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
654 else { success = 1; }
656 //you don't want to trim, if it fails above then scrap it
657 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
659 if(!success) { trashCode += 'q'; }
662 if(minLength > 0 || maxLength > 0){
663 success = cullLength(currSeq);
664 if(!success) { trashCode += 'l'; }
667 success = cullHomoP(currSeq);
668 if(!success) { trashCode += 'h'; }
671 success = cullAmbigs(currSeq);
672 if(!success) { trashCode += 'n'; }
675 if(flip){ // should go last
676 currSeq.reverseComplement();
678 currQual.flipQScores();
682 if(trashCode.length() == 0){
683 currSeq.setAligned(currSeq.getUnaligned());
684 currSeq.printSequence(trimFASTAFile);
687 currQual.printQScores(trimQualFile);
691 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
692 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
693 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
697 if(barcodes.size() != 0){
698 string thisGroup = barcodeNameVector[barcodeIndex];
699 if (primers.size() != 0) {
700 if (primerNameVector[primerIndex] != "") {
701 if(thisGroup != "") {
702 thisGroup += "." + primerNameVector[primerIndex];
704 thisGroup = primerNameVector[primerIndex];
709 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
711 if (nameFile != "") {
712 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
713 if (itName != nameMap.end()) {
714 vector<string> thisSeqsNames;
715 m->splitAtChar(itName->second, thisSeqsNames, ',');
716 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
717 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
719 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
722 map<string, int>::iterator it = groupCounts.find(thisGroup);
723 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
724 else { groupCounts[it->first]++; }
731 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
732 currSeq.printSequence(output);
736 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
737 currQual.printQScores(output);
742 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
743 if (itName != nameMap.end()) {
744 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
745 output << itName->first << '\t' << itName->second << endl;
747 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
752 if(nameFile != ""){ //needs to be before the currSeq name is changed
753 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
754 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
755 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
757 currSeq.setName(currSeq.getName() + '|' + trashCode);
758 currSeq.setUnaligned(origSeq);
759 currSeq.setAligned(origSeq);
760 currSeq.printSequence(scrapFASTAFile);
762 currQual.printQScores(scrapQualFile);
768 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
769 unsigned long long pos = inFASTA.tellg();
770 if ((pos == -1) || (pos >= line.end)) { break; }
773 if (inFASTA.eof()) { break; }
777 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
781 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
785 trimFASTAFile.close();
786 scrapFASTAFile.close();
787 if (createGroup) { outGroupsFile.close(); }
788 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
789 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
793 catch(exception& e) {
794 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
799 /**************************************************************************************************/
801 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
808 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
809 //loop through and create all the processes you want
810 while (process != processors) {
814 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
818 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
819 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
820 vector<vector<string> > tempNameFileNames = nameFileNames;
825 for(int i=0;i<tempFASTAFileNames.size();i++){
826 for(int j=0;j<tempFASTAFileNames[i].size();j++){
827 if (tempFASTAFileNames[i][j] != "") {
828 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
829 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
832 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
833 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
836 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
837 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
844 driverCreateTrim(filename,
846 (trimFASTAFileName + toString(getpid()) + ".temp"),
847 (scrapFASTAFileName + toString(getpid()) + ".temp"),
848 (trimQualFileName + toString(getpid()) + ".temp"),
849 (scrapQualFileName + toString(getpid()) + ".temp"),
850 (trimNameFileName + toString(getpid()) + ".temp"),
851 (scrapNameFileName + toString(getpid()) + ".temp"),
852 (groupFile + toString(getpid()) + ".temp"),
854 tempPrimerQualFileNames,
859 //pass groupCounts to parent
862 string tempFile = filename + toString(getpid()) + ".num.temp";
863 m->openOutputFile(tempFile, out);
865 out << groupCounts.size() << endl;
867 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
868 out << it->first << '\t' << it->second << endl;
874 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
875 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
882 m->openOutputFile(trimFASTAFileName, temp); temp.close();
883 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
885 m->openOutputFile(trimQualFileName, temp); temp.close();
886 m->openOutputFile(scrapQualFileName, temp); temp.close();
888 if (nameFile != "") {
889 m->openOutputFile(trimNameFileName, temp); temp.close();
890 m->openOutputFile(scrapNameFileName, temp); temp.close();
893 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
895 //force parent to wait until all the processes are done
896 for (int i=0;i<processIDS.size();i++) {
897 int temp = processIDS[i];
901 //////////////////////////////////////////////////////////////////////////////////////////////////////
902 //Windows version shared memory, so be careful when passing variables through the trimData struct.
903 //Above fork() will clone, so memory is separate, but that's not the case with windows,
904 //////////////////////////////////////////////////////////////////////////////////////////////////////
906 vector<trimData*> pDataArray;
907 DWORD dwThreadIdArray[processors-1];
908 HANDLE hThreadArray[processors-1];
910 //Create processor worker threads.
911 for( int i=0; i<processors-1; i++){
913 string extension = "";
914 if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
915 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
916 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
917 vector<vector<string> > tempNameFileNames = nameFileNames;
922 for(int i=0;i<tempFASTAFileNames.size();i++){
923 for(int j=0;j<tempFASTAFileNames[i].size();j++){
924 if (tempFASTAFileNames[i][j] != "") {
925 tempFASTAFileNames[i][j] += extension;
926 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
929 tempPrimerQualFileNames[i][j] += extension;
930 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
933 tempNameFileNames[i][j] += extension;
934 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
942 trimData* tempTrim = new trimData(filename,
944 (trimFASTAFileName+extension),
945 (scrapFASTAFileName+extension),
946 (trimQualFileName+extension),
947 (scrapQualFileName+extension),
948 (trimNameFileName+extension),
949 (scrapNameFileName+extension),
950 (groupFile+extension),
952 tempPrimerQualFileNames,
954 lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
955 pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer,
956 primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
957 qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
958 minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap);
959 pDataArray.push_back(tempTrim);
961 hThreadArray[i] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
966 m->openOutputFile(trimFASTAFileName, temp); temp.close();
967 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
969 m->openOutputFile(trimQualFileName, temp); temp.close();
970 m->openOutputFile(scrapQualFileName, temp); temp.close();
972 if (nameFile != "") {
973 m->openOutputFile(trimNameFileName, temp); temp.close();
974 m->openOutputFile(scrapNameFileName, temp); temp.close();
977 driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]);
978 processIDS.push_back(processors-1);
981 //Wait until all threads have terminated.
982 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
984 //Close all thread handles and free memory allocations.
985 for(int i=0; i < pDataArray.size(); i++){
986 for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
987 map<string, int>::iterator it2 = groupCounts.find(it->first);
988 if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
989 else { groupCounts[it->first] += it->second; }
991 CloseHandle(hThreadArray[i]);
992 delete pDataArray[i];
999 for(int i=0;i<processIDS.size();i++){
1001 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1003 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
1004 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
1005 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
1006 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
1008 if(qFileName != ""){
1009 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
1010 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
1011 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
1012 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
1016 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
1017 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
1018 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
1019 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
1023 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
1024 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
1029 for(int j=0;j<fastaFileNames.size();j++){
1030 for(int k=0;k<fastaFileNames[j].size();k++){
1031 if (fastaFileNames[j][k] != "") {
1032 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
1033 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1035 if(qFileName != ""){
1036 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
1037 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1041 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
1042 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1049 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1052 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1053 m->openInputFile(tempFile, in);
1057 in >> tempNum; m->gobble(in);
1061 in >> group >> tempNum; m->gobble(in);
1063 map<string, int>::iterator it = groupCounts.find(group);
1064 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
1065 else { groupCounts[it->first] += tempNum; }
1068 in.close(); m->mothurRemove(tempFile);
1075 catch(exception& e) {
1076 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
1081 /**************************************************************************************************/
1083 int TrimSeqsCommand::setLines(string filename, string qfilename) {
1086 vector<unsigned long long> fastaFilePos;
1087 vector<unsigned long long> qfileFilePos;
1089 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1090 //set file positions for fasta file
1091 fastaFilePos = m->divideFile(filename, processors);
1093 //get name of first sequence in each chunk
1094 map<string, int> firstSeqNames;
1095 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1097 m->openInputFile(filename, in);
1098 in.seekg(fastaFilePos[i]);
1101 firstSeqNames[temp.getName()] = i;
1106 if(qfilename != "") {
1107 //seach for filePos of each first name in the qfile and save in qfileFilePos
1109 m->openInputFile(qfilename, inQual);
1112 while(!inQual.eof()){
1113 input = m->getline(inQual);
1115 if (input.length() != 0) {
1116 if(input[0] == '>'){ //this is a sequence name line
1117 istringstream nameStream(input);
1119 string sname = ""; nameStream >> sname;
1120 sname = sname.substr(1);
1122 map<string, int>::iterator it = firstSeqNames.find(sname);
1124 if(it != firstSeqNames.end()) { //this is the start of a new chunk
1125 unsigned long long pos = inQual.tellg();
1126 qfileFilePos.push_back(pos - input.length() - 1);
1127 firstSeqNames.erase(it);
1132 if (firstSeqNames.size() == 0) { break; }
1137 if (firstSeqNames.size() != 0) {
1138 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
1139 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1145 //get last file position of qfile
1147 unsigned long long size;
1149 //get num bytes in file
1150 pFile = fopen (qfilename.c_str(),"rb");
1151 if (pFile==NULL) perror ("Error opening file");
1153 fseek (pFile, 0, SEEK_END);
1158 qfileFilePos.push_back(size);
1161 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1162 lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
1163 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); }
1165 if(qfilename == "") { qLines = lines; } //files with duds
1171 if (processors == 1) { //save time
1172 //fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1173 //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
1174 lines.push_back(linePair(0, 1000));
1175 if (qfilename != "") { qLines.push_back(linePair(0, 1000)); }
1177 int numFastaSeqs = 0;
1178 fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs);
1179 if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); }
1181 if (qfilename != "") {
1182 int numQualSeqs = 0;
1183 qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs);
1185 if (numFastaSeqs != numQualSeqs) {
1186 m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true;
1190 //figure out how many sequences you have to process
1191 int numSeqsPerProcessor = numFastaSeqs / processors;
1192 for (int i = 0; i < processors; i++) {
1193 int startIndex = i * numSeqsPerProcessor;
1194 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
1195 lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
1196 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
1199 if(qfilename == "") { qLines = lines; } //files with duds
1204 catch(exception& e) {
1205 m->errorOut(e, "TrimSeqsCommand", "setLines");
1210 //***************************************************************************************************************
1212 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1215 m->openInputFile(oligoFile, inOligos);
1219 string type, oligo, group;
1221 int indexPrimer = 0;
1222 int indexBarcode = 0;
1224 while(!inOligos.eof()){
1229 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1230 m->gobble(inOligos);
1233 m->gobble(inOligos);
1234 //make type case insensitive
1235 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1239 for(int i=0;i<oligo.length();i++){
1240 oligo[i] = toupper(oligo[i]);
1241 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1244 if(type == "FORWARD"){
1247 // get rest of line in case there is a primer name
1248 while (!inOligos.eof()) {
1249 char c = inOligos.get();
1250 if (c == 10 || c == 13){ break; }
1251 else if (c == 32 || c == 9){;} //space or tab
1252 else { group += c; }
1255 //check for repeat barcodes
1256 map<string, int>::iterator itPrime = primers.find(oligo);
1257 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1259 primers[oligo]=indexPrimer; indexPrimer++;
1260 primerNameVector.push_back(group);
1262 else if(type == "REVERSE"){
1263 //Sequence oligoRC("reverse", oligo);
1264 //oligoRC.reverseComplement();
1265 string oligoRC = reverseOligo(oligo);
1266 revPrimer.push_back(oligoRC);
1268 else if(type == "BARCODE"){
1271 //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs
1272 //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info
1274 while (!inOligos.eof()) {
1275 char c = inOligos.get();
1276 if (c == 10 || c == 13){ break; }
1277 else if (c == 32 || c == 9){;} //space or tab
1281 //then this is illumina data with 4 columns
1283 string reverseBarcode = reverseOligo(group); //reverse barcode
1286 //check for repeat barcodes
1287 map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
1288 if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine(); }
1290 rbarcodes[reverseBarcode]=indexBarcode;
1293 //check for repeat barcodes
1294 map<string, int>::iterator itBar = barcodes.find(oligo);
1295 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1297 barcodes[oligo]=indexBarcode; indexBarcode++;
1298 barcodeNameVector.push_back(group);
1299 }else if(type == "LINKER"){
1300 linker.push_back(oligo);
1301 }else if(type == "SPACER"){
1302 spacer.push_back(oligo);
1304 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1306 m->gobble(inOligos);
1310 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1312 //add in potential combos
1313 if(barcodeNameVector.size() == 0){
1315 barcodeNameVector.push_back("");
1318 if(primerNameVector.size() == 0){
1320 primerNameVector.push_back("");
1323 fastaFileNames.resize(barcodeNameVector.size());
1324 for(int i=0;i<fastaFileNames.size();i++){
1325 fastaFileNames[i].assign(primerNameVector.size(), "");
1327 if(qFileName != "") { qualFileNames = fastaFileNames; }
1328 if(nameFile != "") { nameFileNames = fastaFileNames; }
1331 set<string> uniqueNames; //used to cleanup outputFileNames
1332 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1333 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1335 string primerName = primerNameVector[itPrimer->second];
1336 string barcodeName = barcodeNameVector[itBar->second];
1338 string comboGroupName = "";
1339 string fastaFileName = "";
1340 string qualFileName = "";
1341 string nameFileName = "";
1343 if(primerName == ""){
1344 comboGroupName = barcodeNameVector[itBar->second];
1347 if(barcodeName == ""){
1348 comboGroupName = primerNameVector[itPrimer->second];
1351 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1357 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1358 if (uniqueNames.count(fastaFileName) == 0) {
1359 outputNames.push_back(fastaFileName);
1360 outputTypes["fasta"].push_back(fastaFileName);
1361 uniqueNames.insert(fastaFileName);
1364 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1365 m->openOutputFile(fastaFileName, temp); temp.close();
1367 if(qFileName != ""){
1368 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1369 if (uniqueNames.count(qualFileName) == 0) {
1370 outputNames.push_back(qualFileName);
1371 outputTypes["qfile"].push_back(qualFileName);
1374 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1375 m->openOutputFile(qualFileName, temp); temp.close();
1379 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1380 if (uniqueNames.count(nameFileName) == 0) {
1381 outputNames.push_back(nameFileName);
1382 outputTypes["name"].push_back(nameFileName);
1385 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1386 m->openOutputFile(nameFileName, temp); temp.close();
1392 numFPrimers = primers.size();
1393 numRPrimers = revPrimer.size();
1394 numLinkers = linker.size();
1395 numSpacers = spacer.size();
1397 bool allBlank = true;
1398 for (int i = 0; i < barcodeNameVector.size(); i++) {
1399 if (barcodeNameVector[i] != "") {
1404 for (int i = 0; i < primerNameVector.size(); i++) {
1405 if (primerNameVector[i] != "") {
1412 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1420 catch(exception& e) {
1421 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1425 //***************************************************************************************************************
1427 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1430 if(qscores.getName() != ""){
1431 qscores.trimQScores(-1, keepFirst);
1433 sequence.trim(keepFirst);
1436 catch(exception& e) {
1437 m->errorOut(e, "keepFirstTrim", "countDiffs");
1443 //***************************************************************************************************************
1445 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1449 int length = sequence.getNumBases() - removeLast;
1452 if(qscores.getName() != ""){
1453 qscores.trimQScores(-1, length);
1455 sequence.trim(length);
1464 catch(exception& e) {
1465 m->errorOut(e, "removeLastTrim", "countDiffs");
1471 //***************************************************************************************************************
1473 bool TrimSeqsCommand::cullLength(Sequence& seq){
1476 int length = seq.getNumBases();
1477 bool success = 0; //guilty until proven innocent
1479 if(length >= minLength && maxLength == 0) { success = 1; }
1480 else if(length >= minLength && length <= maxLength) { success = 1; }
1481 else { success = 0; }
1486 catch(exception& e) {
1487 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1493 //***************************************************************************************************************
1495 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1497 int longHomoP = seq.getLongHomoPolymer();
1498 bool success = 0; //guilty until proven innocent
1500 if(longHomoP <= maxHomoP){ success = 1; }
1501 else { success = 0; }
1505 catch(exception& e) {
1506 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1511 //********************************************************************/
1512 string TrimSeqsCommand::reverseOligo(string oligo){
1514 string reverse = "";
1516 for(int i=oligo.length()-1;i>=0;i--){
1518 if(oligo[i] == 'A') { reverse += 'T'; }
1519 else if(oligo[i] == 'T'){ reverse += 'A'; }
1520 else if(oligo[i] == 'U'){ reverse += 'A'; }
1522 else if(oligo[i] == 'G'){ reverse += 'C'; }
1523 else if(oligo[i] == 'C'){ reverse += 'G'; }
1525 else if(oligo[i] == 'R'){ reverse += 'Y'; }
1526 else if(oligo[i] == 'Y'){ reverse += 'R'; }
1528 else if(oligo[i] == 'M'){ reverse += 'K'; }
1529 else if(oligo[i] == 'K'){ reverse += 'M'; }
1531 else if(oligo[i] == 'W'){ reverse += 'W'; }
1532 else if(oligo[i] == 'S'){ reverse += 'S'; }
1534 else if(oligo[i] == 'B'){ reverse += 'V'; }
1535 else if(oligo[i] == 'V'){ reverse += 'B'; }
1537 else if(oligo[i] == 'D'){ reverse += 'H'; }
1538 else if(oligo[i] == 'H'){ reverse += 'D'; }
1540 else { reverse += 'N'; }
1546 catch(exception& e) {
1547 m->errorOut(e, "TrimSeqsCommand", "reverseOligo");
1552 //***************************************************************************************************************
1554 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1556 int numNs = seq.getAmbigBases();
1557 bool success = 0; //guilty until proven innocent
1559 if(numNs <= maxAmbig) { success = 1; }
1560 else { success = 0; }
1564 catch(exception& e) {
1565 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1570 //***************************************************************************************************************