5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12 #include "trimoligos.h"
14 //**********************************************************************************************************************
15 vector<string> TrimSeqsCommand::setParameters(){
17 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
18 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
19 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
21 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
22 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
23 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
24 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
25 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
26 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
27 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
28 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
29 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
30 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
31 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
32 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
33 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
34 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
35 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
36 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
37 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
38 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
39 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
40 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
41 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
43 vector<string> myArray;
44 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
48 m->errorOut(e, "TrimSeqsCommand", "setParameters");
52 //**********************************************************************************************************************
53 string TrimSeqsCommand::getHelpString(){
55 string helpString = "";
56 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
57 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
58 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
59 helpString += "The fasta parameter is required.\n";
60 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
61 helpString += "The oligos parameter allows you to provide an oligos file.\n";
62 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
63 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
64 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
65 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
66 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
67 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
68 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
69 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
70 helpString += "The qfile parameter allows you to provide a quality file.\n";
71 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
72 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
73 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
74 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
75 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
76 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
77 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
78 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
79 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
80 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
81 helpString += "The trim.seqs command should be in the following format: \n";
82 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
83 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
84 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
85 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
86 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
90 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
96 //**********************************************************************************************************************
98 TrimSeqsCommand::TrimSeqsCommand(){
100 abort = true; calledHelp = true;
102 vector<string> tempOutNames;
103 outputTypes["fasta"] = tempOutNames;
104 outputTypes["qfile"] = tempOutNames;
105 outputTypes["group"] = tempOutNames;
106 outputTypes["name"] = tempOutNames;
108 catch(exception& e) {
109 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
113 //***************************************************************************************************************
115 TrimSeqsCommand::TrimSeqsCommand(string option) {
118 abort = false; calledHelp = false;
121 //allow user to run help
122 if(option == "help") { help(); abort = true; calledHelp = true; }
123 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
126 vector<string> myArray = setParameters();
128 OptionParser parser(option);
129 map<string,string> parameters = parser.getParameters();
131 ValidParameters validParameter;
132 map<string,string>::iterator it;
134 //check to make sure all parameters are valid for command
135 for (it = parameters.begin(); it != parameters.end(); it++) {
136 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
139 //initialize outputTypes
140 vector<string> tempOutNames;
141 outputTypes["fasta"] = tempOutNames;
142 outputTypes["qfile"] = tempOutNames;
143 outputTypes["group"] = tempOutNames;
144 outputTypes["name"] = tempOutNames;
146 //if the user changes the input directory command factory will send this info to us in the output parameter
147 string inputDir = validParameter.validFile(parameters, "inputdir", false);
148 if (inputDir == "not found"){ inputDir = ""; }
151 it = parameters.find("fasta");
152 //user has given a template file
153 if(it != parameters.end()){
154 path = m->hasPath(it->second);
155 //if the user has not given a path then, add inputdir. else leave path alone.
156 if (path == "") { parameters["fasta"] = inputDir + it->second; }
159 it = parameters.find("oligos");
160 //user has given a template file
161 if(it != parameters.end()){
162 path = m->hasPath(it->second);
163 //if the user has not given a path then, add inputdir. else leave path alone.
164 if (path == "") { parameters["oligos"] = inputDir + it->second; }
167 it = parameters.find("qfile");
168 //user has given a template file
169 if(it != parameters.end()){
170 path = m->hasPath(it->second);
171 //if the user has not given a path then, add inputdir. else leave path alone.
172 if (path == "") { parameters["qfile"] = inputDir + it->second; }
175 it = parameters.find("name");
176 //user has given a template file
177 if(it != parameters.end()){
178 path = m->hasPath(it->second);
179 //if the user has not given a path then, add inputdir. else leave path alone.
180 if (path == "") { parameters["name"] = inputDir + it->second; }
186 //check for required parameters
187 fastaFile = validParameter.validFile(parameters, "fasta", true);
188 if (fastaFile == "not found") {
189 fastaFile = m->getFastaFile();
190 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
191 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
192 }else if (fastaFile == "not open") { abort = true; }
193 else { m->setFastaFile(fastaFile); }
195 //if the user changes the output directory command factory will send this info to us in the output parameter
196 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
198 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
202 //check for optional parameter and set defaults
203 // ...at some point should added some additional type checking...
205 temp = validParameter.validFile(parameters, "flip", false);
206 if (temp == "not found") { flip = 0; }
207 else { flip = m->isTrue(temp); }
209 temp = validParameter.validFile(parameters, "oligos", true);
210 if (temp == "not found"){ oligoFile = ""; }
211 else if(temp == "not open"){ abort = true; }
212 else { oligoFile = temp; m->setOligosFile(oligoFile); }
215 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
216 m->mothurConvert(temp, maxAmbig);
218 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
219 m->mothurConvert(temp, maxHomoP);
221 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
222 m->mothurConvert(temp, minLength);
224 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
225 m->mothurConvert(temp, maxLength);
227 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
228 m->mothurConvert(temp, bdiffs);
230 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
231 m->mothurConvert(temp, pdiffs);
233 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
234 m->mothurConvert(temp, tdiffs);
236 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
238 temp = validParameter.validFile(parameters, "qfile", true);
239 if (temp == "not found") { qFileName = ""; }
240 else if(temp == "not open") { abort = true; }
241 else { qFileName = temp; m->setQualFile(qFileName); }
243 temp = validParameter.validFile(parameters, "name", true);
244 if (temp == "not found") { nameFile = ""; }
245 else if(temp == "not open") { nameFile = ""; abort = true; }
246 else { nameFile = temp; m->setNameFile(nameFile); }
248 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
249 m->mothurConvert(temp, qThreshold);
251 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
252 qtrim = m->isTrue(temp);
254 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
255 convert(temp, qRollAverage);
257 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
258 convert(temp, qWindowAverage);
260 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
261 convert(temp, qWindowSize);
263 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
264 convert(temp, qWindowStep);
266 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
267 convert(temp, qAverage);
269 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
270 convert(temp, keepFirst);
272 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
273 convert(temp, removeLast);
275 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
276 allFiles = m->isTrue(temp);
278 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
279 m->setProcessors(temp);
280 m->mothurConvert(temp, processors);
283 if(allFiles && (oligoFile == "")){
284 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
286 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
287 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
291 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
292 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
296 if (nameFile == "") {
297 vector<string> files; files.push_back(fastaFile);
298 parser.getNameFile(files);
303 catch(exception& e) {
304 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
308 //***************************************************************************************************************
310 int TrimSeqsCommand::execute(){
313 if (abort == true) { if (calledHelp) { return 0; } return 2; }
315 numFPrimers = 0; //this needs to be initialized
318 vector<vector<string> > fastaFileNames;
319 vector<vector<string> > qualFileNames;
320 vector<vector<string> > nameFileNames;
322 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
323 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
325 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
326 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
328 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
329 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
331 if (qFileName != "") {
332 outputNames.push_back(trimQualFile);
333 outputNames.push_back(scrapQualFile);
334 outputTypes["qfile"].push_back(trimQualFile);
335 outputTypes["qfile"].push_back(scrapQualFile);
338 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
339 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
341 if (nameFile != "") {
342 m->readNames(nameFile, nameMap);
343 outputNames.push_back(trimNameFile);
344 outputNames.push_back(scrapNameFile);
345 outputTypes["name"].push_back(trimNameFile);
346 outputTypes["name"].push_back(scrapNameFile);
349 if (m->control_pressed) { return 0; }
351 string outputGroupFileName;
353 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
355 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
356 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
360 vector<unsigned long long> fastaFilePos;
361 vector<unsigned long long> qFilePos;
363 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
365 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
366 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
367 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
369 if(qFileName == "") { qLines = lines; } //files with duds
371 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
373 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
375 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
378 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
381 if (m->control_pressed) { return 0; }
384 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
385 map<string, string>::iterator it;
386 set<string> namesToRemove;
387 for(int i=0;i<fastaFileNames.size();i++){
388 for(int j=0;j<fastaFileNames[0].size();j++){
389 if (fastaFileNames[i][j] != "") {
390 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
391 if(m->isBlank(fastaFileNames[i][j])){
392 m->mothurRemove(fastaFileNames[i][j]);
393 namesToRemove.insert(fastaFileNames[i][j]);
396 m->mothurRemove(qualFileNames[i][j]);
397 namesToRemove.insert(qualFileNames[i][j]);
401 m->mothurRemove(nameFileNames[i][j]);
402 namesToRemove.insert(nameFileNames[i][j]);
405 it = uniqueFastaNames.find(fastaFileNames[i][j]);
406 if (it == uniqueFastaNames.end()) {
407 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
415 //remove names for outputFileNames, just cleans up the output
416 vector<string> outputNames2;
417 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
418 outputNames = outputNames2;
420 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
422 m->openInputFile(it->first, in);
425 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
426 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
427 m->openOutputFile(thisGroupName, out);
430 if (m->control_pressed) { break; }
432 Sequence currSeq(in); m->gobble(in);
433 out << currSeq.getName() << '\t' << it->second << endl;
440 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
442 //output group counts
443 m->mothurOutEndLine();
445 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
446 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
447 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
449 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
451 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
453 //set fasta file as new current fastafile
455 itTypes = outputTypes.find("fasta");
456 if (itTypes != outputTypes.end()) {
457 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
460 itTypes = outputTypes.find("name");
461 if (itTypes != outputTypes.end()) {
462 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
465 itTypes = outputTypes.find("qfile");
466 if (itTypes != outputTypes.end()) {
467 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
470 itTypes = outputTypes.find("group");
471 if (itTypes != outputTypes.end()) {
472 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
475 m->mothurOutEndLine();
476 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
477 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
478 m->mothurOutEndLine();
483 catch(exception& e) {
484 m->errorOut(e, "TrimSeqsCommand", "execute");
489 /**************************************************************************************/
491 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair* line, linePair* qline) {
495 ofstream trimFASTAFile;
496 m->openOutputFile(trimFileName, trimFASTAFile);
498 ofstream scrapFASTAFile;
499 m->openOutputFile(scrapFileName, scrapFASTAFile);
501 ofstream trimQualFile;
502 ofstream scrapQualFile;
504 m->openOutputFile(trimQFileName, trimQualFile);
505 m->openOutputFile(scrapQFileName, scrapQualFile);
508 ofstream trimNameFile;
509 ofstream scrapNameFile;
511 m->openOutputFile(trimNFileName, trimNameFile);
512 m->openOutputFile(scrapNFileName, scrapNameFile);
516 ofstream outGroupsFile;
517 if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); }
519 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
520 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
521 if (fastaFileNames[i][j] != "") {
523 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
525 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
529 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
537 m->openInputFile(filename, inFASTA);
538 inFASTA.seekg(line->start);
541 if(qFileName != "") {
542 m->openInputFile(qFileName, qFile);
543 qFile.seekg(qline->start);
548 TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer);
552 if (m->control_pressed) {
553 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
554 if (createGroup) { outGroupsFile.close(); }
559 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
565 string trashCode = "";
566 int currentSeqsDiffs = 0;
568 Sequence currSeq(inFASTA); m->gobble(inFASTA);
569 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
570 QualityScores currQual;
572 currQual = QualityScores(qFile); m->gobble(qFile);
575 string origSeq = currSeq.getUnaligned();
578 int barcodeIndex = 0;
581 if(barcodes.size() != 0){
582 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
583 if(success > bdiffs) { trashCode += 'b'; }
584 else{ currentSeqsDiffs += success; }
587 if(numFPrimers != 0){
588 success = trimOligos.stripForward(currSeq, currQual, primerIndex);
589 if(success > pdiffs) { trashCode += 'f'; }
590 else{ currentSeqsDiffs += success; }
593 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
595 if(numRPrimers != 0){
596 success = trimOligos.stripReverse(currSeq, currQual);
597 if(!success) { trashCode += 'r'; }
601 success = keepFirstTrim(currSeq, currQual);
605 success = removeLastTrim(currSeq, currQual);
606 if(!success) { trashCode += 'l'; }
611 int origLength = currSeq.getNumBases();
613 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
614 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
615 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
616 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
617 else { success = 1; }
619 //you don't want to trim, if it fails above then scrap it
620 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
622 if(!success) { trashCode += 'q'; }
625 if(minLength > 0 || maxLength > 0){
626 success = cullLength(currSeq);
627 if(!success) { trashCode += 'l'; }
630 success = cullHomoP(currSeq);
631 if(!success) { trashCode += 'h'; }
634 success = cullAmbigs(currSeq);
635 if(!success) { trashCode += 'n'; }
638 if(flip){ // should go last
639 currSeq.reverseComplement();
641 currQual.flipQScores();
645 if(trashCode.length() == 0){
646 currSeq.setAligned(currSeq.getUnaligned());
647 currSeq.printSequence(trimFASTAFile);
650 currQual.printQScores(trimQualFile);
654 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
655 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
656 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
660 if(barcodes.size() != 0){
661 string thisGroup = barcodeNameVector[barcodeIndex];
662 if (primers.size() != 0) {
663 if (primerNameVector[primerIndex] != "") {
664 if(thisGroup != "") {
665 thisGroup += "." + primerNameVector[primerIndex];
667 thisGroup = primerNameVector[primerIndex];
672 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
674 if (nameFile != "") {
675 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
676 if (itName != nameMap.end()) {
677 vector<string> thisSeqsNames;
678 m->splitAtChar(itName->second, thisSeqsNames, ',');
679 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
680 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
682 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
685 map<string, int>::iterator it = groupCounts.find(thisGroup);
686 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
687 else { groupCounts[it->first]++; }
694 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
695 currSeq.printSequence(output);
699 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
700 currQual.printQScores(output);
705 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
706 if (itName != nameMap.end()) {
707 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
708 output << itName->first << '\t' << itName->second << endl;
710 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
715 if(nameFile != ""){ //needs to be before the currSeq name is changed
716 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
717 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
718 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
720 currSeq.setName(currSeq.getName() + '|' + trashCode);
721 currSeq.setUnaligned(origSeq);
722 currSeq.setAligned(origSeq);
723 currSeq.printSequence(scrapFASTAFile);
725 currQual.printQScores(scrapQualFile);
731 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
732 unsigned long long pos = inFASTA.tellg();
733 if ((pos == -1) || (pos >= line->end)) { break; }
736 if (inFASTA.eof()) { break; }
740 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
744 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
748 trimFASTAFile.close();
749 scrapFASTAFile.close();
750 if (createGroup) { outGroupsFile.close(); }
751 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
752 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
756 catch(exception& e) {
757 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
762 /**************************************************************************************************/
764 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
766 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
771 //loop through and create all the processes you want
772 while (process != processors) {
776 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
780 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
781 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
782 vector<vector<string> > tempNameFileNames = nameFileNames;
787 for(int i=0;i<tempFASTAFileNames.size();i++){
788 for(int j=0;j<tempFASTAFileNames[i].size();j++){
789 if (tempFASTAFileNames[i][j] != "") {
790 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
791 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
794 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
795 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
798 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
799 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
806 driverCreateTrim(filename,
808 (trimFASTAFileName + toString(getpid()) + ".temp"),
809 (scrapFASTAFileName + toString(getpid()) + ".temp"),
810 (trimQualFileName + toString(getpid()) + ".temp"),
811 (scrapQualFileName + toString(getpid()) + ".temp"),
812 (trimNameFileName + toString(getpid()) + ".temp"),
813 (scrapNameFileName + toString(getpid()) + ".temp"),
814 (groupFile + toString(getpid()) + ".temp"),
816 tempPrimerQualFileNames,
821 //pass groupCounts to parent
824 string tempFile = filename + toString(getpid()) + ".num.temp";
825 m->openOutputFile(tempFile, out);
827 out << groupCounts.size() << endl;
829 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
830 out << it->first << '\t' << it->second << endl;
836 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
837 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
844 m->openOutputFile(trimFASTAFileName, temp); temp.close();
845 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
847 m->openOutputFile(trimQualFileName, temp); temp.close();
848 m->openOutputFile(scrapQualFileName, temp); temp.close();
850 if (nameFile != "") {
851 m->openOutputFile(trimNameFileName, temp); temp.close();
852 m->openOutputFile(scrapNameFileName, temp); temp.close();
855 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
857 //force parent to wait until all the processes are done
858 for (int i=0;i<processIDS.size();i++) {
859 int temp = processIDS[i];
864 for(int i=0;i<processIDS.size();i++){
866 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
868 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
869 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
870 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
871 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
874 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
875 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
876 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
877 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
881 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
882 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
883 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
884 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
888 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
889 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
894 for(int j=0;j<fastaFileNames.size();j++){
895 for(int k=0;k<fastaFileNames[j].size();k++){
896 if (fastaFileNames[j][k] != "") {
897 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
898 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
901 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
902 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
906 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
907 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
916 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
917 m->openInputFile(tempFile, in);
921 in >> tempNum; m->gobble(in);
925 in >> group >> tempNum; m->gobble(in);
927 map<string, int>::iterator it = groupCounts.find(group);
928 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
929 else { groupCounts[it->first] += tempNum; }
932 in.close(); m->mothurRemove(tempFile);
940 catch(exception& e) {
941 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
946 /**************************************************************************************************/
948 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long long>& fastaFilePos, vector<unsigned long long>& qfileFilePos) {
950 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
951 //set file positions for fasta file
952 fastaFilePos = m->divideFile(filename, processors);
954 if (qfilename == "") { return processors; }
956 //get name of first sequence in each chunk
957 map<string, int> firstSeqNames;
958 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
960 m->openInputFile(filename, in);
961 in.seekg(fastaFilePos[i]);
964 firstSeqNames[temp.getName()] = i;
969 //seach for filePos of each first name in the qfile and save in qfileFilePos
971 m->openInputFile(qfilename, inQual);
974 while(!inQual.eof()){
975 input = m->getline(inQual);
977 if (input.length() != 0) {
978 if(input[0] == '>'){ //this is a sequence name line
979 istringstream nameStream(input);
981 string sname = ""; nameStream >> sname;
982 sname = sname.substr(1);
984 map<string, int>::iterator it = firstSeqNames.find(sname);
986 if(it != firstSeqNames.end()) { //this is the start of a new chunk
987 unsigned long long pos = inQual.tellg();
988 qfileFilePos.push_back(pos - input.length() - 1);
989 firstSeqNames.erase(it);
994 if (firstSeqNames.size() == 0) { break; }
999 if (firstSeqNames.size() != 0) {
1000 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
1001 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1007 //get last file position of qfile
1009 unsigned long long size;
1011 //get num bytes in file
1012 pFile = fopen (qfilename.c_str(),"rb");
1013 if (pFile==NULL) perror ("Error opening file");
1015 fseek (pFile, 0, SEEK_END);
1020 qfileFilePos.push_back(size);
1026 fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1027 fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
1032 catch(exception& e) {
1033 m->errorOut(e, "TrimSeqsCommand", "setLines");
1038 //***************************************************************************************************************
1040 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1043 m->openInputFile(oligoFile, inOligos);
1047 string type, oligo, group;
1049 int indexPrimer = 0;
1050 int indexBarcode = 0;
1052 while(!inOligos.eof()){
1057 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1058 m->gobble(inOligos);
1061 m->gobble(inOligos);
1062 //make type case insensitive
1063 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1067 for(int i=0;i<oligo.length();i++){
1068 oligo[i] = toupper(oligo[i]);
1069 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1072 if(type == "FORWARD"){
1075 // get rest of line in case there is a primer name
1076 while (!inOligos.eof()) {
1077 char c = inOligos.get();
1078 if (c == 10 || c == 13){ break; }
1079 else if (c == 32 || c == 9){;} //space or tab
1080 else { group += c; }
1083 //check for repeat barcodes
1084 map<string, int>::iterator itPrime = primers.find(oligo);
1085 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1087 primers[oligo]=indexPrimer; indexPrimer++;
1088 primerNameVector.push_back(group);
1090 else if(type == "REVERSE"){
1091 Sequence oligoRC("reverse", oligo);
1092 oligoRC.reverseComplement();
1093 revPrimer.push_back(oligoRC.getUnaligned());
1095 else if(type == "BARCODE"){
1098 //check for repeat barcodes
1099 map<string, int>::iterator itBar = barcodes.find(oligo);
1100 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1102 barcodes[oligo]=indexBarcode; indexBarcode++;
1103 barcodeNameVector.push_back(group);
1105 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1107 m->gobble(inOligos);
1111 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1113 //add in potential combos
1114 if(barcodeNameVector.size() == 0){
1116 barcodeNameVector.push_back("");
1119 if(primerNameVector.size() == 0){
1121 primerNameVector.push_back("");
1124 fastaFileNames.resize(barcodeNameVector.size());
1125 for(int i=0;i<fastaFileNames.size();i++){
1126 fastaFileNames[i].assign(primerNameVector.size(), "");
1128 if(qFileName != "") { qualFileNames = fastaFileNames; }
1129 if(nameFile != "") { nameFileNames = fastaFileNames; }
1132 set<string> uniqueNames; //used to cleanup outputFileNames
1133 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1134 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1136 string primerName = primerNameVector[itPrimer->second];
1137 string barcodeName = barcodeNameVector[itBar->second];
1139 string comboGroupName = "";
1140 string fastaFileName = "";
1141 string qualFileName = "";
1142 string nameFileName = "";
1144 if(primerName == ""){
1145 comboGroupName = barcodeNameVector[itBar->second];
1148 if(barcodeName == ""){
1149 comboGroupName = primerNameVector[itPrimer->second];
1152 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1158 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1159 if (uniqueNames.count(fastaFileName) == 0) {
1160 outputNames.push_back(fastaFileName);
1161 outputTypes["fasta"].push_back(fastaFileName);
1162 uniqueNames.insert(fastaFileName);
1165 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1166 m->openOutputFile(fastaFileName, temp); temp.close();
1168 if(qFileName != ""){
1169 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1170 if (uniqueNames.count(qualFileName) == 0) {
1171 outputNames.push_back(qualFileName);
1172 outputTypes["qfile"].push_back(qualFileName);
1175 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1176 m->openOutputFile(qualFileName, temp); temp.close();
1180 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1181 if (uniqueNames.count(nameFileName) == 0) {
1182 outputNames.push_back(nameFileName);
1183 outputTypes["name"].push_back(nameFileName);
1186 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1187 m->openOutputFile(nameFileName, temp); temp.close();
1193 numFPrimers = primers.size();
1194 numRPrimers = revPrimer.size();
1196 bool allBlank = true;
1197 for (int i = 0; i < barcodeNameVector.size(); i++) {
1198 if (barcodeNameVector[i] != "") {
1203 for (int i = 0; i < primerNameVector.size(); i++) {
1204 if (primerNameVector[i] != "") {
1211 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1219 catch(exception& e) {
1220 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1224 //***************************************************************************************************************
1226 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1229 if(qscores.getName() != ""){
1230 qscores.trimQScores(-1, keepFirst);
1232 sequence.trim(keepFirst);
1235 catch(exception& e) {
1236 m->errorOut(e, "keepFirstTrim", "countDiffs");
1242 //***************************************************************************************************************
1244 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1248 int length = sequence.getNumBases() - removeLast;
1251 if(qscores.getName() != ""){
1252 qscores.trimQScores(-1, length);
1254 sequence.trim(length);
1263 catch(exception& e) {
1264 m->errorOut(e, "removeLastTrim", "countDiffs");
1270 //***************************************************************************************************************
1272 bool TrimSeqsCommand::cullLength(Sequence& seq){
1275 int length = seq.getNumBases();
1276 bool success = 0; //guilty until proven innocent
1278 if(length >= minLength && maxLength == 0) { success = 1; }
1279 else if(length >= minLength && length <= maxLength) { success = 1; }
1280 else { success = 0; }
1285 catch(exception& e) {
1286 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1292 //***************************************************************************************************************
1294 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1296 int longHomoP = seq.getLongHomoPolymer();
1297 bool success = 0; //guilty until proven innocent
1299 if(longHomoP <= maxHomoP){ success = 1; }
1300 else { success = 0; }
1304 catch(exception& e) {
1305 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1311 //***************************************************************************************************************
1313 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1315 int numNs = seq.getAmbigBases();
1316 bool success = 0; //guilty until proven innocent
1318 if(numNs <= maxAmbig) { success = 1; }
1319 else { success = 0; }
1323 catch(exception& e) {
1324 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1329 //***************************************************************************************************************