5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12 #include "trimoligos.h"
14 //**********************************************************************************************************************
15 vector<string> TrimSeqsCommand::setParameters(){
17 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
18 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
19 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
21 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
22 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
23 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
24 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
25 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
26 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
27 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
28 CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
29 CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
30 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
31 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
32 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
33 CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward);
34 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
35 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
36 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
37 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
38 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
39 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
40 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
41 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
42 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
43 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
44 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
46 vector<string> myArray;
47 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
51 m->errorOut(e, "TrimSeqsCommand", "setParameters");
55 //**********************************************************************************************************************
56 string TrimSeqsCommand::getHelpString(){
58 string helpString = "";
59 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
60 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
61 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
62 helpString += "The fasta parameter is required.\n";
63 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
64 helpString += "The oligos parameter allows you to provide an oligos file.\n";
65 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
66 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
67 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
68 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
69 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
70 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
71 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
72 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
73 helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
74 helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
75 helpString += "The qfile parameter allows you to provide a quality file.\n";
76 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
77 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
78 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
79 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
80 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
81 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
82 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
83 helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n";
84 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
85 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
86 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
87 helpString += "The trim.seqs command should be in the following format: \n";
88 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
89 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
90 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
91 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
92 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
96 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
102 //**********************************************************************************************************************
104 TrimSeqsCommand::TrimSeqsCommand(){
106 abort = true; calledHelp = true;
108 vector<string> tempOutNames;
109 outputTypes["fasta"] = tempOutNames;
110 outputTypes["qfile"] = tempOutNames;
111 outputTypes["group"] = tempOutNames;
112 outputTypes["name"] = tempOutNames;
114 catch(exception& e) {
115 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
119 //***************************************************************************************************************
121 TrimSeqsCommand::TrimSeqsCommand(string option) {
124 abort = false; calledHelp = false;
127 //allow user to run help
128 if(option == "help") { help(); abort = true; calledHelp = true; }
129 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
132 vector<string> myArray = setParameters();
134 OptionParser parser(option);
135 map<string,string> parameters = parser.getParameters();
137 ValidParameters validParameter;
138 map<string,string>::iterator it;
140 //check to make sure all parameters are valid for command
141 for (it = parameters.begin(); it != parameters.end(); it++) {
142 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
145 //initialize outputTypes
146 vector<string> tempOutNames;
147 outputTypes["fasta"] = tempOutNames;
148 outputTypes["qfile"] = tempOutNames;
149 outputTypes["group"] = tempOutNames;
150 outputTypes["name"] = tempOutNames;
152 //if the user changes the input directory command factory will send this info to us in the output parameter
153 string inputDir = validParameter.validFile(parameters, "inputdir", false);
154 if (inputDir == "not found"){ inputDir = ""; }
157 it = parameters.find("fasta");
158 //user has given a template file
159 if(it != parameters.end()){
160 path = m->hasPath(it->second);
161 //if the user has not given a path then, add inputdir. else leave path alone.
162 if (path == "") { parameters["fasta"] = inputDir + it->second; }
165 it = parameters.find("oligos");
166 //user has given a template file
167 if(it != parameters.end()){
168 path = m->hasPath(it->second);
169 //if the user has not given a path then, add inputdir. else leave path alone.
170 if (path == "") { parameters["oligos"] = inputDir + it->second; }
173 it = parameters.find("qfile");
174 //user has given a template file
175 if(it != parameters.end()){
176 path = m->hasPath(it->second);
177 //if the user has not given a path then, add inputdir. else leave path alone.
178 if (path == "") { parameters["qfile"] = inputDir + it->second; }
181 it = parameters.find("name");
182 //user has given a template file
183 if(it != parameters.end()){
184 path = m->hasPath(it->second);
185 //if the user has not given a path then, add inputdir. else leave path alone.
186 if (path == "") { parameters["name"] = inputDir + it->second; }
192 //check for required parameters
193 fastaFile = validParameter.validFile(parameters, "fasta", true);
194 if (fastaFile == "not found") {
195 fastaFile = m->getFastaFile();
196 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
197 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
198 }else if (fastaFile == "not open") { abort = true; }
199 else { m->setFastaFile(fastaFile); }
201 //if the user changes the output directory command factory will send this info to us in the output parameter
202 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
204 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
208 //check for optional parameter and set defaults
209 // ...at some point should added some additional type checking...
211 temp = validParameter.validFile(parameters, "flip", false);
212 if (temp == "not found") { flip = 0; }
213 else { flip = m->isTrue(temp); }
215 temp = validParameter.validFile(parameters, "oligos", true);
216 if (temp == "not found"){ oligoFile = ""; }
217 else if(temp == "not open"){ abort = true; }
218 else { oligoFile = temp; m->setOligosFile(oligoFile); }
221 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
222 m->mothurConvert(temp, maxAmbig);
224 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
225 m->mothurConvert(temp, maxHomoP);
227 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
228 m->mothurConvert(temp, minLength);
230 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
231 m->mothurConvert(temp, maxLength);
233 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
234 m->mothurConvert(temp, bdiffs);
236 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
237 m->mothurConvert(temp, pdiffs);
239 temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
240 m->mothurConvert(temp, ldiffs);
242 temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
243 m->mothurConvert(temp, sdiffs);
245 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
246 m->mothurConvert(temp, tdiffs);
248 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
250 temp = validParameter.validFile(parameters, "qfile", true);
251 if (temp == "not found") { qFileName = ""; }
252 else if(temp == "not open") { abort = true; }
253 else { qFileName = temp; m->setQualFile(qFileName); }
255 temp = validParameter.validFile(parameters, "name", true);
256 if (temp == "not found") { nameFile = ""; }
257 else if(temp == "not open") { nameFile = ""; abort = true; }
258 else { nameFile = temp; m->setNameFile(nameFile); }
260 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
261 m->mothurConvert(temp, qThreshold);
263 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
264 qtrim = m->isTrue(temp);
266 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
267 convert(temp, qRollAverage);
269 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
270 convert(temp, qWindowAverage);
272 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
273 convert(temp, qWindowSize);
275 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
276 convert(temp, qWindowStep);
278 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
279 convert(temp, qAverage);
281 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
282 convert(temp, keepFirst);
284 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
285 convert(temp, removeLast);
287 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
288 allFiles = m->isTrue(temp);
290 temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; }
291 keepforward = m->isTrue(temp);
293 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
294 m->setProcessors(temp);
295 m->mothurConvert(temp, processors);
298 if(allFiles && (oligoFile == "")){
299 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
301 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
302 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
306 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
307 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
311 if (nameFile == "") {
312 vector<string> files; files.push_back(fastaFile);
313 parser.getNameFile(files);
318 catch(exception& e) {
319 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
323 //***************************************************************************************************************
325 int TrimSeqsCommand::execute(){
328 if (abort == true) { if (calledHelp) { return 0; } return 2; }
330 numFPrimers = 0; //this needs to be initialized
335 vector<vector<string> > fastaFileNames;
336 vector<vector<string> > qualFileNames;
337 vector<vector<string> > nameFileNames;
339 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
340 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
342 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
343 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
345 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
346 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
348 if (qFileName != "") {
349 outputNames.push_back(trimQualFile);
350 outputNames.push_back(scrapQualFile);
351 outputTypes["qfile"].push_back(trimQualFile);
352 outputTypes["qfile"].push_back(scrapQualFile);
355 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
356 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
358 if (nameFile != "") {
359 m->readNames(nameFile, nameMap);
360 outputNames.push_back(trimNameFile);
361 outputNames.push_back(scrapNameFile);
362 outputTypes["name"].push_back(trimNameFile);
363 outputTypes["name"].push_back(scrapNameFile);
366 if (m->control_pressed) { return 0; }
368 string outputGroupFileName;
370 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
372 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
373 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
377 //fills lines and qlines
378 setLines(fastaFile, qFileName);
381 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
383 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
387 if (m->control_pressed) { return 0; }
390 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
391 map<string, string>::iterator it;
392 set<string> namesToRemove;
393 for(int i=0;i<fastaFileNames.size();i++){
394 for(int j=0;j<fastaFileNames[0].size();j++){
395 if (fastaFileNames[i][j] != "") {
396 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
397 if(m->isBlank(fastaFileNames[i][j])){
398 m->mothurRemove(fastaFileNames[i][j]);
399 namesToRemove.insert(fastaFileNames[i][j]);
402 m->mothurRemove(qualFileNames[i][j]);
403 namesToRemove.insert(qualFileNames[i][j]);
407 m->mothurRemove(nameFileNames[i][j]);
408 namesToRemove.insert(nameFileNames[i][j]);
411 it = uniqueFastaNames.find(fastaFileNames[i][j]);
412 if (it == uniqueFastaNames.end()) {
413 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
421 //remove names for outputFileNames, just cleans up the output
422 vector<string> outputNames2;
423 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
424 outputNames = outputNames2;
426 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
428 m->openInputFile(it->first, in);
431 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
432 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
433 m->openOutputFile(thisGroupName, out);
436 if (m->control_pressed) { break; }
438 Sequence currSeq(in); m->gobble(in);
439 out << currSeq.getName() << '\t' << it->second << endl;
441 if (nameFile != "") {
442 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
443 if (itName != nameMap.end()) {
444 vector<string> thisSeqsNames;
445 m->splitAtChar(itName->second, thisSeqsNames, ',');
446 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
447 out << thisSeqsNames[k] << '\t' << it->second << endl;
449 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
457 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
459 //output group counts
460 m->mothurOutEndLine();
462 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
463 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
464 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
466 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
468 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
470 //set fasta file as new current fastafile
472 itTypes = outputTypes.find("fasta");
473 if (itTypes != outputTypes.end()) {
474 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
477 itTypes = outputTypes.find("name");
478 if (itTypes != outputTypes.end()) {
479 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
482 itTypes = outputTypes.find("qfile");
483 if (itTypes != outputTypes.end()) {
484 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
487 itTypes = outputTypes.find("group");
488 if (itTypes != outputTypes.end()) {
489 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
492 m->mothurOutEndLine();
493 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
494 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
495 m->mothurOutEndLine();
500 catch(exception& e) {
501 m->errorOut(e, "TrimSeqsCommand", "execute");
506 /**************************************************************************************/
508 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {
512 ofstream trimFASTAFile;
513 m->openOutputFile(trimFileName, trimFASTAFile);
515 ofstream scrapFASTAFile;
516 m->openOutputFile(scrapFileName, scrapFASTAFile);
518 ofstream trimQualFile;
519 ofstream scrapQualFile;
521 m->openOutputFile(trimQFileName, trimQualFile);
522 m->openOutputFile(scrapQFileName, scrapQualFile);
525 ofstream trimNameFile;
526 ofstream scrapNameFile;
528 m->openOutputFile(trimNFileName, trimNameFile);
529 m->openOutputFile(scrapNFileName, scrapNameFile);
533 ofstream outGroupsFile;
534 if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); }
536 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
537 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
538 if (fastaFileNames[i][j] != "") {
540 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
542 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
546 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
554 m->openInputFile(filename, inFASTA);
555 inFASTA.seekg(line.start);
558 if(qFileName != "") {
559 m->openInputFile(qFileName, qFile);
560 qFile.seekg(qline.start);
565 TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
569 if (m->control_pressed) {
570 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
571 if (createGroup) { outGroupsFile.close(); }
576 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
582 string trashCode = "";
583 int currentSeqsDiffs = 0;
585 Sequence currSeq(inFASTA); m->gobble(inFASTA);
586 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
587 QualityScores currQual;
589 currQual = QualityScores(qFile); m->gobble(qFile);
592 string origSeq = currSeq.getUnaligned();
595 int barcodeIndex = 0;
599 success = trimOligos.stripLinker(currSeq, currQual);
600 if(success > ldiffs) { trashCode += 'k'; }
601 else{ currentSeqsDiffs += success; }
605 if(barcodes.size() != 0){
606 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
607 if(success > bdiffs) { trashCode += 'b'; }
608 else{ currentSeqsDiffs += success; }
611 if(rbarcodes.size() != 0){
612 success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
613 if(success > bdiffs) { trashCode += 'b'; }
614 else{ currentSeqsDiffs += success; }
618 success = trimOligos.stripSpacer(currSeq, currQual);
619 if(success > sdiffs) { trashCode += 's'; }
620 else{ currentSeqsDiffs += success; }
624 if(numFPrimers != 0){
625 success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward);
626 if(success > pdiffs) { trashCode += 'f'; }
627 else{ currentSeqsDiffs += success; }
630 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
632 if(numRPrimers != 0){
633 success = trimOligos.stripReverse(currSeq, currQual);
634 if(!success) { trashCode += 'r'; }
638 success = keepFirstTrim(currSeq, currQual);
642 success = removeLastTrim(currSeq, currQual);
643 if(!success) { trashCode += 'l'; }
648 int origLength = currSeq.getNumBases();
650 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
651 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
652 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
653 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
654 else { success = 1; }
656 //you don't want to trim, if it fails above then scrap it
657 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
659 if(!success) { trashCode += 'q'; }
662 if(minLength > 0 || maxLength > 0){
663 success = cullLength(currSeq);
664 if(!success) { trashCode += 'l'; }
667 success = cullHomoP(currSeq);
668 if(!success) { trashCode += 'h'; }
671 success = cullAmbigs(currSeq);
672 if(!success) { trashCode += 'n'; }
675 if(flip){ // should go last
676 currSeq.reverseComplement();
678 currQual.flipQScores();
682 if(trashCode.length() == 0){
683 currSeq.setAligned(currSeq.getUnaligned());
684 currSeq.printSequence(trimFASTAFile);
687 currQual.printQScores(trimQualFile);
692 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
693 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
694 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
698 if(barcodes.size() != 0){
699 string thisGroup = barcodeNameVector[barcodeIndex];
700 if (primers.size() != 0) {
701 if (primerNameVector[primerIndex] != "") {
702 if(thisGroup != "") {
703 thisGroup += "." + primerNameVector[primerIndex];
705 thisGroup = primerNameVector[primerIndex];
710 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
712 int numRedundants = 0;
713 if (nameFile != "") {
714 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
715 if (itName != nameMap.end()) {
716 vector<string> thisSeqsNames;
717 m->splitAtChar(itName->second, thisSeqsNames, ',');
718 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
719 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
720 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
722 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
725 map<string, int>::iterator it = groupCounts.find(thisGroup);
726 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; }
727 else { groupCounts[it->first] += (1 + numRedundants); }
734 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
735 currSeq.printSequence(output);
739 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
740 currQual.printQScores(output);
745 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
746 if (itName != nameMap.end()) {
747 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
748 output << itName->first << '\t' << itName->second << endl;
750 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
755 if(nameFile != ""){ //needs to be before the currSeq name is changed
756 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
757 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
758 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
760 currSeq.setName(currSeq.getName() + '|' + trashCode);
761 currSeq.setUnaligned(origSeq);
762 currSeq.setAligned(origSeq);
763 currSeq.printSequence(scrapFASTAFile);
765 currQual.printQScores(scrapQualFile);
771 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
772 unsigned long long pos = inFASTA.tellg();
773 if ((pos == -1) || (pos >= line.end)) { break; }
776 if (inFASTA.eof()) { break; }
780 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
784 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
788 trimFASTAFile.close();
789 scrapFASTAFile.close();
790 if (createGroup) { outGroupsFile.close(); }
791 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
792 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
796 catch(exception& e) {
797 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
802 /**************************************************************************************************/
804 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
811 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
812 //loop through and create all the processes you want
813 while (process != processors) {
817 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
821 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
822 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
823 vector<vector<string> > tempNameFileNames = nameFileNames;
828 for(int i=0;i<tempFASTAFileNames.size();i++){
829 for(int j=0;j<tempFASTAFileNames[i].size();j++){
830 if (tempFASTAFileNames[i][j] != "") {
831 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
832 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
835 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
836 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
839 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
840 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
847 driverCreateTrim(filename,
849 (trimFASTAFileName + toString(getpid()) + ".temp"),
850 (scrapFASTAFileName + toString(getpid()) + ".temp"),
851 (trimQualFileName + toString(getpid()) + ".temp"),
852 (scrapQualFileName + toString(getpid()) + ".temp"),
853 (trimNameFileName + toString(getpid()) + ".temp"),
854 (scrapNameFileName + toString(getpid()) + ".temp"),
855 (groupFile + toString(getpid()) + ".temp"),
857 tempPrimerQualFileNames,
862 //pass groupCounts to parent
865 string tempFile = filename + toString(getpid()) + ".num.temp";
866 m->openOutputFile(tempFile, out);
868 out << groupCounts.size() << endl;
870 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
871 out << it->first << '\t' << it->second << endl;
877 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
878 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
885 m->openOutputFile(trimFASTAFileName, temp); temp.close();
886 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
888 m->openOutputFile(trimQualFileName, temp); temp.close();
889 m->openOutputFile(scrapQualFileName, temp); temp.close();
891 if (nameFile != "") {
892 m->openOutputFile(trimNameFileName, temp); temp.close();
893 m->openOutputFile(scrapNameFileName, temp); temp.close();
896 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
898 //force parent to wait until all the processes are done
899 for (int i=0;i<processIDS.size();i++) {
900 int temp = processIDS[i];
904 //////////////////////////////////////////////////////////////////////////////////////////////////////
905 //Windows version shared memory, so be careful when passing variables through the trimData struct.
906 //Above fork() will clone, so memory is separate, but that's not the case with windows,
907 //////////////////////////////////////////////////////////////////////////////////////////////////////
909 vector<trimData*> pDataArray;
910 DWORD dwThreadIdArray[processors-1];
911 HANDLE hThreadArray[processors-1];
913 //Create processor worker threads.
914 for( int i=0; i<processors-1; i++){
916 string extension = "";
917 if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
918 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
919 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
920 vector<vector<string> > tempNameFileNames = nameFileNames;
925 for(int i=0;i<tempFASTAFileNames.size();i++){
926 for(int j=0;j<tempFASTAFileNames[i].size();j++){
927 if (tempFASTAFileNames[i][j] != "") {
928 tempFASTAFileNames[i][j] += extension;
929 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
932 tempPrimerQualFileNames[i][j] += extension;
933 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
936 tempNameFileNames[i][j] += extension;
937 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
945 trimData* tempTrim = new trimData(filename,
947 (trimFASTAFileName+extension),
948 (scrapFASTAFileName+extension),
949 (trimQualFileName+extension),
950 (scrapQualFileName+extension),
951 (trimNameFileName+extension),
952 (scrapNameFileName+extension),
953 (groupFile+extension),
955 tempPrimerQualFileNames,
957 lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
958 pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer,
959 primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
960 qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
961 minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap);
962 pDataArray.push_back(tempTrim);
964 hThreadArray[i] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
969 m->openOutputFile(trimFASTAFileName, temp); temp.close();
970 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
972 m->openOutputFile(trimQualFileName, temp); temp.close();
973 m->openOutputFile(scrapQualFileName, temp); temp.close();
975 if (nameFile != "") {
976 m->openOutputFile(trimNameFileName, temp); temp.close();
977 m->openOutputFile(scrapNameFileName, temp); temp.close();
980 driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]);
981 processIDS.push_back(processors-1);
984 //Wait until all threads have terminated.
985 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
987 //Close all thread handles and free memory allocations.
988 for(int i=0; i < pDataArray.size(); i++){
989 for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
990 map<string, int>::iterator it2 = groupCounts.find(it->first);
991 if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
992 else { groupCounts[it->first] += it->second; }
994 CloseHandle(hThreadArray[i]);
995 delete pDataArray[i];
1002 for(int i=0;i<processIDS.size();i++){
1004 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
1006 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
1007 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
1008 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
1009 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
1011 if(qFileName != ""){
1012 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
1013 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
1014 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
1015 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
1019 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
1020 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
1021 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
1022 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
1026 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
1027 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
1032 for(int j=0;j<fastaFileNames.size();j++){
1033 for(int k=0;k<fastaFileNames[j].size();k++){
1034 if (fastaFileNames[j][k] != "") {
1035 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
1036 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1038 if(qFileName != ""){
1039 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
1040 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1044 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
1045 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
1052 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1055 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
1056 m->openInputFile(tempFile, in);
1060 in >> tempNum; m->gobble(in);
1064 in >> group >> tempNum; m->gobble(in);
1066 map<string, int>::iterator it = groupCounts.find(group);
1067 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
1068 else { groupCounts[it->first] += tempNum; }
1071 in.close(); m->mothurRemove(tempFile);
1078 catch(exception& e) {
1079 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
1084 /**************************************************************************************************/
1086 int TrimSeqsCommand::setLines(string filename, string qfilename) {
1089 vector<unsigned long long> fastaFilePos;
1090 vector<unsigned long long> qfileFilePos;
1092 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1093 //set file positions for fasta file
1094 fastaFilePos = m->divideFile(filename, processors);
1096 //get name of first sequence in each chunk
1097 map<string, int> firstSeqNames;
1098 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1100 m->openInputFile(filename, in);
1101 in.seekg(fastaFilePos[i]);
1104 firstSeqNames[temp.getName()] = i;
1109 if(qfilename != "") {
1110 //seach for filePos of each first name in the qfile and save in qfileFilePos
1112 m->openInputFile(qfilename, inQual);
1115 while(!inQual.eof()){
1116 input = m->getline(inQual);
1118 if (input.length() != 0) {
1119 if(input[0] == '>'){ //this is a sequence name line
1120 istringstream nameStream(input);
1122 string sname = ""; nameStream >> sname;
1123 sname = sname.substr(1);
1125 map<string, int>::iterator it = firstSeqNames.find(sname);
1127 if(it != firstSeqNames.end()) { //this is the start of a new chunk
1128 unsigned long long pos = inQual.tellg();
1129 qfileFilePos.push_back(pos - input.length() - 1);
1130 firstSeqNames.erase(it);
1135 if (firstSeqNames.size() == 0) { break; }
1140 if (firstSeqNames.size() != 0) {
1141 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
1142 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1148 //get last file position of qfile
1150 unsigned long long size;
1152 //get num bytes in file
1153 pFile = fopen (qfilename.c_str(),"rb");
1154 if (pFile==NULL) perror ("Error opening file");
1156 fseek (pFile, 0, SEEK_END);
1161 qfileFilePos.push_back(size);
1164 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
1165 lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
1166 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); }
1168 if(qfilename == "") { qLines = lines; } //files with duds
1174 if (processors == 1) { //save time
1175 //fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1176 //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
1177 lines.push_back(linePair(0, 1000));
1178 if (qfilename != "") { qLines.push_back(linePair(0, 1000)); }
1180 int numFastaSeqs = 0;
1181 fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs);
1182 if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); }
1184 if (qfilename != "") {
1185 int numQualSeqs = 0;
1186 qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs);
1188 if (numFastaSeqs != numQualSeqs) {
1189 m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true;
1193 //figure out how many sequences you have to process
1194 int numSeqsPerProcessor = numFastaSeqs / processors;
1195 for (int i = 0; i < processors; i++) {
1196 int startIndex = i * numSeqsPerProcessor;
1197 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
1198 lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
1199 if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
1202 if(qfilename == "") { qLines = lines; } //files with duds
1207 catch(exception& e) {
1208 m->errorOut(e, "TrimSeqsCommand", "setLines");
1213 //***************************************************************************************************************
1215 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1218 m->openInputFile(oligoFile, inOligos);
1222 string type, oligo, group;
1224 int indexPrimer = 0;
1225 int indexBarcode = 0;
1227 while(!inOligos.eof()){
1232 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1233 m->gobble(inOligos);
1236 m->gobble(inOligos);
1237 //make type case insensitive
1238 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1242 for(int i=0;i<oligo.length();i++){
1243 oligo[i] = toupper(oligo[i]);
1244 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1247 if(type == "FORWARD"){
1250 // get rest of line in case there is a primer name
1251 while (!inOligos.eof()) {
1252 char c = inOligos.get();
1253 if (c == 10 || c == 13){ break; }
1254 else if (c == 32 || c == 9){;} //space or tab
1255 else { group += c; }
1258 //check for repeat barcodes
1259 map<string, int>::iterator itPrime = primers.find(oligo);
1260 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1262 primers[oligo]=indexPrimer; indexPrimer++;
1263 primerNameVector.push_back(group);
1265 else if(type == "REVERSE"){
1266 //Sequence oligoRC("reverse", oligo);
1267 //oligoRC.reverseComplement();
1268 string oligoRC = reverseOligo(oligo);
1269 revPrimer.push_back(oligoRC);
1271 else if(type == "BARCODE"){
1274 //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs
1275 //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info
1277 while (!inOligos.eof()) {
1278 char c = inOligos.get();
1279 if (c == 10 || c == 13){ break; }
1280 else if (c == 32 || c == 9){;} //space or tab
1284 //then this is illumina data with 4 columns
1286 string reverseBarcode = reverseOligo(group); //reverse barcode
1289 //check for repeat barcodes
1290 map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
1291 if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine(); }
1293 rbarcodes[reverseBarcode]=indexBarcode;
1296 //check for repeat barcodes
1297 map<string, int>::iterator itBar = barcodes.find(oligo);
1298 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1300 barcodes[oligo]=indexBarcode; indexBarcode++;
1301 barcodeNameVector.push_back(group);
1302 }else if(type == "LINKER"){
1303 linker.push_back(oligo);
1304 }else if(type == "SPACER"){
1305 spacer.push_back(oligo);
1307 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1309 m->gobble(inOligos);
1313 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1315 //add in potential combos
1316 if(barcodeNameVector.size() == 0){
1318 barcodeNameVector.push_back("");
1321 if(primerNameVector.size() == 0){
1323 primerNameVector.push_back("");
1326 fastaFileNames.resize(barcodeNameVector.size());
1327 for(int i=0;i<fastaFileNames.size();i++){
1328 fastaFileNames[i].assign(primerNameVector.size(), "");
1330 if(qFileName != "") { qualFileNames = fastaFileNames; }
1331 if(nameFile != "") { nameFileNames = fastaFileNames; }
1334 set<string> uniqueNames; //used to cleanup outputFileNames
1335 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1336 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1338 string primerName = primerNameVector[itPrimer->second];
1339 string barcodeName = barcodeNameVector[itBar->second];
1341 string comboGroupName = "";
1342 string fastaFileName = "";
1343 string qualFileName = "";
1344 string nameFileName = "";
1346 if(primerName == ""){
1347 comboGroupName = barcodeNameVector[itBar->second];
1350 if(barcodeName == ""){
1351 comboGroupName = primerNameVector[itPrimer->second];
1354 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1360 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1361 if (uniqueNames.count(fastaFileName) == 0) {
1362 outputNames.push_back(fastaFileName);
1363 outputTypes["fasta"].push_back(fastaFileName);
1364 uniqueNames.insert(fastaFileName);
1367 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1368 m->openOutputFile(fastaFileName, temp); temp.close();
1370 if(qFileName != ""){
1371 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1372 if (uniqueNames.count(qualFileName) == 0) {
1373 outputNames.push_back(qualFileName);
1374 outputTypes["qfile"].push_back(qualFileName);
1377 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1378 m->openOutputFile(qualFileName, temp); temp.close();
1382 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1383 if (uniqueNames.count(nameFileName) == 0) {
1384 outputNames.push_back(nameFileName);
1385 outputTypes["name"].push_back(nameFileName);
1388 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1389 m->openOutputFile(nameFileName, temp); temp.close();
1395 numFPrimers = primers.size();
1396 numRPrimers = revPrimer.size();
1397 numLinkers = linker.size();
1398 numSpacers = spacer.size();
1400 bool allBlank = true;
1401 for (int i = 0; i < barcodeNameVector.size(); i++) {
1402 if (barcodeNameVector[i] != "") {
1407 for (int i = 0; i < primerNameVector.size(); i++) {
1408 if (primerNameVector[i] != "") {
1415 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1423 catch(exception& e) {
1424 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1428 //***************************************************************************************************************
1430 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1433 if(qscores.getName() != ""){
1434 qscores.trimQScores(-1, keepFirst);
1436 sequence.trim(keepFirst);
1439 catch(exception& e) {
1440 m->errorOut(e, "keepFirstTrim", "countDiffs");
1446 //***************************************************************************************************************
1448 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1452 int length = sequence.getNumBases() - removeLast;
1455 if(qscores.getName() != ""){
1456 qscores.trimQScores(-1, length);
1458 sequence.trim(length);
1467 catch(exception& e) {
1468 m->errorOut(e, "removeLastTrim", "countDiffs");
1474 //***************************************************************************************************************
1476 bool TrimSeqsCommand::cullLength(Sequence& seq){
1479 int length = seq.getNumBases();
1480 bool success = 0; //guilty until proven innocent
1482 if(length >= minLength && maxLength == 0) { success = 1; }
1483 else if(length >= minLength && length <= maxLength) { success = 1; }
1484 else { success = 0; }
1489 catch(exception& e) {
1490 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1496 //***************************************************************************************************************
1498 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1500 int longHomoP = seq.getLongHomoPolymer();
1501 bool success = 0; //guilty until proven innocent
1503 if(longHomoP <= maxHomoP){ success = 1; }
1504 else { success = 0; }
1508 catch(exception& e) {
1509 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1514 //********************************************************************/
1515 string TrimSeqsCommand::reverseOligo(string oligo){
1517 string reverse = "";
1519 for(int i=oligo.length()-1;i>=0;i--){
1521 if(oligo[i] == 'A') { reverse += 'T'; }
1522 else if(oligo[i] == 'T'){ reverse += 'A'; }
1523 else if(oligo[i] == 'U'){ reverse += 'A'; }
1525 else if(oligo[i] == 'G'){ reverse += 'C'; }
1526 else if(oligo[i] == 'C'){ reverse += 'G'; }
1528 else if(oligo[i] == 'R'){ reverse += 'Y'; }
1529 else if(oligo[i] == 'Y'){ reverse += 'R'; }
1531 else if(oligo[i] == 'M'){ reverse += 'K'; }
1532 else if(oligo[i] == 'K'){ reverse += 'M'; }
1534 else if(oligo[i] == 'W'){ reverse += 'W'; }
1535 else if(oligo[i] == 'S'){ reverse += 'S'; }
1537 else if(oligo[i] == 'B'){ reverse += 'V'; }
1538 else if(oligo[i] == 'V'){ reverse += 'B'; }
1540 else if(oligo[i] == 'D'){ reverse += 'H'; }
1541 else if(oligo[i] == 'H'){ reverse += 'D'; }
1543 else { reverse += 'N'; }
1549 catch(exception& e) {
1550 m->errorOut(e, "TrimSeqsCommand", "reverseOligo");
1555 //***************************************************************************************************************
1557 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1559 int numNs = seq.getAmbigBases();
1560 bool success = 0; //guilty until proven innocent
1562 if(numNs <= maxAmbig) { success = 1; }
1563 else { success = 0; }
1567 catch(exception& e) {
1568 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1573 //***************************************************************************************************************