5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12 #include "trimoligos.h"
14 //**********************************************************************************************************************
15 vector<string> TrimSeqsCommand::setParameters(){
17 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
18 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
19 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
21 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
22 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
23 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
24 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
25 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
26 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
27 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
28 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
29 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
30 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
31 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
32 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
33 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
34 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
35 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
36 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
37 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
38 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
39 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
40 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
41 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
43 vector<string> myArray;
44 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
48 m->errorOut(e, "TrimSeqsCommand", "setParameters");
52 //**********************************************************************************************************************
53 string TrimSeqsCommand::getHelpString(){
55 string helpString = "";
56 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
57 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
58 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
59 helpString += "The fasta parameter is required.\n";
60 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
61 helpString += "The oligos parameter allows you to provide an oligos file.\n";
62 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
63 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
64 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
65 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
66 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
67 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
68 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
69 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
70 helpString += "The qfile parameter allows you to provide a quality file.\n";
71 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
72 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
73 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
74 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
75 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
76 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
77 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
78 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
79 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
80 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
81 helpString += "The trim.seqs command should be in the following format: \n";
82 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
83 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
84 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
85 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
86 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
90 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
96 //**********************************************************************************************************************
98 TrimSeqsCommand::TrimSeqsCommand(){
100 abort = true; calledHelp = true;
102 vector<string> tempOutNames;
103 outputTypes["fasta"] = tempOutNames;
104 outputTypes["qfile"] = tempOutNames;
105 outputTypes["group"] = tempOutNames;
106 outputTypes["name"] = tempOutNames;
108 catch(exception& e) {
109 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
113 //***************************************************************************************************************
115 TrimSeqsCommand::TrimSeqsCommand(string option) {
118 abort = false; calledHelp = false;
121 //allow user to run help
122 if(option == "help") { help(); abort = true; calledHelp = true; }
123 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
126 vector<string> myArray = setParameters();
128 OptionParser parser(option);
129 map<string,string> parameters = parser.getParameters();
131 ValidParameters validParameter;
132 map<string,string>::iterator it;
134 //check to make sure all parameters are valid for command
135 for (it = parameters.begin(); it != parameters.end(); it++) {
136 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
139 //initialize outputTypes
140 vector<string> tempOutNames;
141 outputTypes["fasta"] = tempOutNames;
142 outputTypes["qfile"] = tempOutNames;
143 outputTypes["group"] = tempOutNames;
144 outputTypes["name"] = tempOutNames;
146 //if the user changes the input directory command factory will send this info to us in the output parameter
147 string inputDir = validParameter.validFile(parameters, "inputdir", false);
148 if (inputDir == "not found"){ inputDir = ""; }
151 it = parameters.find("fasta");
152 //user has given a template file
153 if(it != parameters.end()){
154 path = m->hasPath(it->second);
155 //if the user has not given a path then, add inputdir. else leave path alone.
156 if (path == "") { parameters["fasta"] = inputDir + it->second; }
159 it = parameters.find("oligos");
160 //user has given a template file
161 if(it != parameters.end()){
162 path = m->hasPath(it->second);
163 //if the user has not given a path then, add inputdir. else leave path alone.
164 if (path == "") { parameters["oligos"] = inputDir + it->second; }
167 it = parameters.find("qfile");
168 //user has given a template file
169 if(it != parameters.end()){
170 path = m->hasPath(it->second);
171 //if the user has not given a path then, add inputdir. else leave path alone.
172 if (path == "") { parameters["qfile"] = inputDir + it->second; }
175 it = parameters.find("name");
176 //user has given a template file
177 if(it != parameters.end()){
178 path = m->hasPath(it->second);
179 //if the user has not given a path then, add inputdir. else leave path alone.
180 if (path == "") { parameters["name"] = inputDir + it->second; }
186 //check for required parameters
187 fastaFile = validParameter.validFile(parameters, "fasta", true);
188 if (fastaFile == "not found") {
189 fastaFile = m->getFastaFile();
190 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
191 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
192 }else if (fastaFile == "not open") { abort = true; }
193 else { m->setFastaFile(fastaFile); }
195 //if the user changes the output directory command factory will send this info to us in the output parameter
196 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
198 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
202 //check for optional parameter and set defaults
203 // ...at some point should added some additional type checking...
205 temp = validParameter.validFile(parameters, "flip", false);
206 if (temp == "not found"){ flip = 0; }
207 else if(m->isTrue(temp)) { flip = 1; }
209 temp = validParameter.validFile(parameters, "oligos", true);
210 if (temp == "not found"){ oligoFile = ""; }
211 else if(temp == "not open"){ abort = true; }
212 else { oligoFile = temp; m->setOligosFile(oligoFile); }
215 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
216 convert(temp, maxAmbig);
218 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
219 convert(temp, maxHomoP);
221 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
222 convert(temp, minLength);
224 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
225 convert(temp, maxLength);
227 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
228 convert(temp, bdiffs);
230 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
231 convert(temp, pdiffs);
233 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
234 convert(temp, tdiffs);
236 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
238 temp = validParameter.validFile(parameters, "qfile", true);
239 if (temp == "not found") { qFileName = ""; }
240 else if(temp == "not open") { abort = true; }
241 else { qFileName = temp; m->setQualFile(qFileName); }
243 temp = validParameter.validFile(parameters, "name", true);
244 if (temp == "not found") { nameFile = ""; }
245 else if(temp == "not open") { nameFile = ""; abort = true; }
246 else { nameFile = temp; m->setNameFile(nameFile); }
248 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
249 convert(temp, qThreshold);
251 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
252 qtrim = m->isTrue(temp);
254 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
255 convert(temp, qRollAverage);
257 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
258 convert(temp, qWindowAverage);
260 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
261 convert(temp, qWindowSize);
263 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
264 convert(temp, qWindowStep);
266 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
267 convert(temp, qAverage);
269 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
270 convert(temp, keepFirst);
272 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
273 convert(temp, removeLast);
275 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
276 allFiles = m->isTrue(temp);
278 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
279 m->setProcessors(temp);
280 convert(temp, processors);
283 if(allFiles && (oligoFile == "")){
284 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
286 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
287 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
291 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
292 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
298 catch(exception& e) {
299 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
303 //***************************************************************************************************************
305 int TrimSeqsCommand::execute(){
308 if (abort == true) { if (calledHelp) { return 0; } return 2; }
310 numFPrimers = 0; //this needs to be initialized
313 vector<vector<string> > fastaFileNames;
314 vector<vector<string> > qualFileNames;
315 vector<vector<string> > nameFileNames;
317 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
318 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
320 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
321 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
323 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
324 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
326 if (qFileName != "") {
327 outputNames.push_back(trimQualFile);
328 outputNames.push_back(scrapQualFile);
329 outputTypes["qfile"].push_back(trimQualFile);
330 outputTypes["qfile"].push_back(scrapQualFile);
333 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
334 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
336 if (nameFile != "") {
337 m->readNames(nameFile, nameMap);
338 outputNames.push_back(trimNameFile);
339 outputNames.push_back(scrapNameFile);
340 outputTypes["name"].push_back(trimNameFile);
341 outputTypes["name"].push_back(scrapNameFile);
344 if (m->control_pressed) { return 0; }
346 string outputGroupFileName;
348 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
350 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
351 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
355 vector<unsigned long long> fastaFilePos;
356 vector<unsigned long long> qFilePos;
358 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
360 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
361 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
362 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
364 if(qFileName == "") { qLines = lines; } //files with duds
366 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
368 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
370 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
373 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
376 if (m->control_pressed) { return 0; }
379 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
380 map<string, string>::iterator it;
381 set<string> namesToRemove;
382 for(int i=0;i<fastaFileNames.size();i++){
383 for(int j=0;j<fastaFileNames[0].size();j++){
384 if (fastaFileNames[i][j] != "") {
385 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
386 if(m->isBlank(fastaFileNames[i][j])){
387 m->mothurRemove(fastaFileNames[i][j]);
388 namesToRemove.insert(fastaFileNames[i][j]);
391 m->mothurRemove(qualFileNames[i][j]);
392 namesToRemove.insert(qualFileNames[i][j]);
396 m->mothurRemove(nameFileNames[i][j]);
397 namesToRemove.insert(nameFileNames[i][j]);
400 it = uniqueFastaNames.find(fastaFileNames[i][j]);
401 if (it == uniqueFastaNames.end()) {
402 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
410 //remove names for outputFileNames, just cleans up the output
411 vector<string> outputNames2;
412 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
413 outputNames = outputNames2;
415 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
417 m->openInputFile(it->first, in);
420 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
421 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
422 m->openOutputFile(thisGroupName, out);
425 if (m->control_pressed) { break; }
427 Sequence currSeq(in); m->gobble(in);
428 out << currSeq.getName() << '\t' << it->second << endl;
435 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
437 //output group counts
438 m->mothurOutEndLine();
440 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
441 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
442 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
444 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
446 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
448 //set fasta file as new current fastafile
450 itTypes = outputTypes.find("fasta");
451 if (itTypes != outputTypes.end()) {
452 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
455 itTypes = outputTypes.find("name");
456 if (itTypes != outputTypes.end()) {
457 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
460 itTypes = outputTypes.find("qfile");
461 if (itTypes != outputTypes.end()) {
462 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
465 itTypes = outputTypes.find("group");
466 if (itTypes != outputTypes.end()) {
467 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
470 m->mothurOutEndLine();
471 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
472 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
473 m->mothurOutEndLine();
478 catch(exception& e) {
479 m->errorOut(e, "TrimSeqsCommand", "execute");
484 /**************************************************************************************/
486 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair* line, linePair* qline) {
490 ofstream trimFASTAFile;
491 m->openOutputFile(trimFileName, trimFASTAFile);
493 ofstream scrapFASTAFile;
494 m->openOutputFile(scrapFileName, scrapFASTAFile);
496 ofstream trimQualFile;
497 ofstream scrapQualFile;
499 m->openOutputFile(trimQFileName, trimQualFile);
500 m->openOutputFile(scrapQFileName, scrapQualFile);
503 ofstream trimNameFile;
504 ofstream scrapNameFile;
506 m->openOutputFile(trimNFileName, trimNameFile);
507 m->openOutputFile(scrapNFileName, scrapNameFile);
511 ofstream outGroupsFile;
512 if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); }
514 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
515 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
516 if (fastaFileNames[i][j] != "") {
518 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
520 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
524 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
532 m->openInputFile(filename, inFASTA);
533 inFASTA.seekg(line->start);
536 if(qFileName != "") {
537 m->openInputFile(qFileName, qFile);
538 qFile.seekg(qline->start);
543 TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer);
547 if (m->control_pressed) {
548 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
549 if (createGroup) { outGroupsFile.close(); }
554 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
560 string trashCode = "";
561 int currentSeqsDiffs = 0;
563 Sequence currSeq(inFASTA); m->gobble(inFASTA);
564 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
565 QualityScores currQual;
567 currQual = QualityScores(qFile); m->gobble(qFile);
570 string origSeq = currSeq.getUnaligned();
573 int barcodeIndex = 0;
576 if(barcodes.size() != 0){
577 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
578 if(success > bdiffs) { trashCode += 'b'; }
579 else{ currentSeqsDiffs += success; }
582 if(numFPrimers != 0){
583 success = trimOligos.stripForward(currSeq, currQual, primerIndex);
584 if(success > pdiffs) { trashCode += 'f'; }
585 else{ currentSeqsDiffs += success; }
588 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
590 if(numRPrimers != 0){
591 success = trimOligos.stripReverse(currSeq, currQual);
592 if(!success) { trashCode += 'r'; }
596 success = keepFirstTrim(currSeq, currQual);
600 success = removeLastTrim(currSeq, currQual);
601 if(!success) { trashCode += 'l'; }
606 int origLength = currSeq.getNumBases();
608 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
609 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
610 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
611 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
612 else { success = 1; }
614 //you don't want to trim, if it fails above then scrap it
615 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
617 if(!success) { trashCode += 'q'; }
620 if(minLength > 0 || maxLength > 0){
621 success = cullLength(currSeq);
622 if(!success) { trashCode += 'l'; }
625 success = cullHomoP(currSeq);
626 if(!success) { trashCode += 'h'; }
629 success = cullAmbigs(currSeq);
630 if(!success) { trashCode += 'n'; }
633 if(flip){ // should go last
634 currSeq.reverseComplement();
636 currQual.flipQScores();
640 if(trashCode.length() == 0){
641 currSeq.setAligned(currSeq.getUnaligned());
642 currSeq.printSequence(trimFASTAFile);
645 currQual.printQScores(trimQualFile);
649 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
650 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
651 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
655 if(barcodes.size() != 0){
656 string thisGroup = barcodeNameVector[barcodeIndex];
657 if (primers.size() != 0) {
658 if (primerNameVector[primerIndex] != "") {
659 if(thisGroup != "") {
660 thisGroup += "." + primerNameVector[primerIndex];
662 thisGroup = primerNameVector[primerIndex];
667 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
669 if (nameFile != "") {
670 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
671 if (itName != nameMap.end()) {
672 vector<string> thisSeqsNames;
673 m->splitAtChar(itName->second, thisSeqsNames, ',');
674 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
675 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
677 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
680 map<string, int>::iterator it = groupCounts.find(thisGroup);
681 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
682 else { groupCounts[it->first]++; }
689 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
690 currSeq.printSequence(output);
694 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
695 currQual.printQScores(output);
700 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
701 if (itName != nameMap.end()) {
702 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
703 output << itName->first << '\t' << itName->second << endl;
705 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
710 if(nameFile != ""){ //needs to be before the currSeq name is changed
711 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
712 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
713 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
715 currSeq.setName(currSeq.getName() + '|' + trashCode);
716 currSeq.setUnaligned(origSeq);
717 currSeq.setAligned(origSeq);
718 currSeq.printSequence(scrapFASTAFile);
720 currQual.printQScores(scrapQualFile);
726 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
727 unsigned long long pos = inFASTA.tellg();
728 if ((pos == -1) || (pos >= line->end)) { break; }
731 if (inFASTA.eof()) { break; }
735 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
739 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
743 trimFASTAFile.close();
744 scrapFASTAFile.close();
745 if (createGroup) { outGroupsFile.close(); }
746 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
747 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
751 catch(exception& e) {
752 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
757 /**************************************************************************************************/
759 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
761 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
766 //loop through and create all the processes you want
767 while (process != processors) {
771 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
775 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
776 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
777 vector<vector<string> > tempNameFileNames = nameFileNames;
782 for(int i=0;i<tempFASTAFileNames.size();i++){
783 for(int j=0;j<tempFASTAFileNames[i].size();j++){
784 if (tempFASTAFileNames[i][j] != "") {
785 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
786 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
789 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
790 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
793 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
794 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
801 driverCreateTrim(filename,
803 (trimFASTAFileName + toString(getpid()) + ".temp"),
804 (scrapFASTAFileName + toString(getpid()) + ".temp"),
805 (trimQualFileName + toString(getpid()) + ".temp"),
806 (scrapQualFileName + toString(getpid()) + ".temp"),
807 (trimNameFileName + toString(getpid()) + ".temp"),
808 (scrapNameFileName + toString(getpid()) + ".temp"),
809 (groupFile + toString(getpid()) + ".temp"),
811 tempPrimerQualFileNames,
816 //pass groupCounts to parent
819 string tempFile = filename + toString(getpid()) + ".num.temp";
820 m->openOutputFile(tempFile, out);
822 out << groupCounts.size() << endl;
824 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
825 out << it->first << '\t' << it->second << endl;
831 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
832 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
839 m->openOutputFile(trimFASTAFileName, temp); temp.close();
840 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
842 m->openOutputFile(trimQualFileName, temp); temp.close();
843 m->openOutputFile(scrapQualFileName, temp); temp.close();
845 if (nameFile != "") {
846 m->openOutputFile(trimNameFileName, temp); temp.close();
847 m->openOutputFile(scrapNameFileName, temp); temp.close();
850 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
852 //force parent to wait until all the processes are done
853 for (int i=0;i<processIDS.size();i++) {
854 int temp = processIDS[i];
859 for(int i=0;i<processIDS.size();i++){
861 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
863 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
864 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
865 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
866 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
869 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
870 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
871 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
872 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
876 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
877 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
878 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
879 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
883 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
884 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
889 for(int j=0;j<fastaFileNames.size();j++){
890 for(int k=0;k<fastaFileNames[j].size();k++){
891 if (fastaFileNames[j][k] != "") {
892 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
893 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
896 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
897 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
901 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
902 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
911 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
912 m->openInputFile(tempFile, in);
916 in >> tempNum; m->gobble(in);
920 in >> group >> tempNum; m->gobble(in);
922 map<string, int>::iterator it = groupCounts.find(group);
923 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
924 else { groupCounts[it->first] += tempNum; }
927 in.close(); m->mothurRemove(tempFile);
935 catch(exception& e) {
936 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
941 /**************************************************************************************************/
943 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long long>& fastaFilePos, vector<unsigned long long>& qfileFilePos) {
945 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
946 //set file positions for fasta file
947 fastaFilePos = m->divideFile(filename, processors);
949 if (qfilename == "") { return processors; }
951 //get name of first sequence in each chunk
952 map<string, int> firstSeqNames;
953 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
955 m->openInputFile(filename, in);
956 in.seekg(fastaFilePos[i]);
959 firstSeqNames[temp.getName()] = i;
964 //seach for filePos of each first name in the qfile and save in qfileFilePos
966 m->openInputFile(qfilename, inQual);
969 while(!inQual.eof()){
970 input = m->getline(inQual);
972 if (input.length() != 0) {
973 if(input[0] == '>'){ //this is a sequence name line
974 istringstream nameStream(input);
976 string sname = ""; nameStream >> sname;
977 sname = sname.substr(1);
979 map<string, int>::iterator it = firstSeqNames.find(sname);
981 if(it != firstSeqNames.end()) { //this is the start of a new chunk
982 unsigned long long pos = inQual.tellg();
983 qfileFilePos.push_back(pos - input.length() - 1);
984 firstSeqNames.erase(it);
989 if (firstSeqNames.size() == 0) { break; }
994 if (firstSeqNames.size() != 0) {
995 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
996 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1002 //get last file position of qfile
1004 unsigned long long size;
1006 //get num bytes in file
1007 pFile = fopen (qfilename.c_str(),"rb");
1008 if (pFile==NULL) perror ("Error opening file");
1010 fseek (pFile, 0, SEEK_END);
1015 qfileFilePos.push_back(size);
1021 fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1022 //get last file position of fastafile
1024 unsigned long long size;
1026 //get num bytes in file
1027 pFile = fopen (filename.c_str(),"rb");
1028 if (pFile==NULL) perror ("Error opening file");
1030 fseek (pFile, 0, SEEK_END);
1034 fastaFilePos.push_back(size);
1036 //get last file position of fastafile
1039 //get num bytes in file
1040 qFile = fopen (qfilename.c_str(),"rb");
1041 if (qFile==NULL) perror ("Error opening file");
1043 fseek (qFile, 0, SEEK_END);
1047 qfileFilePos.push_back(size);
1053 catch(exception& e) {
1054 m->errorOut(e, "TrimSeqsCommand", "setLines");
1059 //***************************************************************************************************************
1061 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1064 m->openInputFile(oligoFile, inOligos);
1068 string type, oligo, group;
1070 int indexPrimer = 0;
1071 int indexBarcode = 0;
1073 while(!inOligos.eof()){
1078 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1079 m->gobble(inOligos);
1082 m->gobble(inOligos);
1083 //make type case insensitive
1084 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1088 for(int i=0;i<oligo.length();i++){
1089 oligo[i] = toupper(oligo[i]);
1090 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1093 if(type == "FORWARD"){
1096 // get rest of line in case there is a primer name
1097 while (!inOligos.eof()) {
1098 char c = inOligos.get();
1099 if (c == 10 || c == 13){ break; }
1100 else if (c == 32 || c == 9){;} //space or tab
1101 else { group += c; }
1104 //check for repeat barcodes
1105 map<string, int>::iterator itPrime = primers.find(oligo);
1106 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1108 primers[oligo]=indexPrimer; indexPrimer++;
1109 primerNameVector.push_back(group);
1111 else if(type == "REVERSE"){
1112 Sequence oligoRC("reverse", oligo);
1113 oligoRC.reverseComplement();
1114 revPrimer.push_back(oligoRC.getUnaligned());
1116 else if(type == "BARCODE"){
1119 //check for repeat barcodes
1120 map<string, int>::iterator itBar = barcodes.find(oligo);
1121 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1123 barcodes[oligo]=indexBarcode; indexBarcode++;
1124 barcodeNameVector.push_back(group);
1126 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1128 m->gobble(inOligos);
1132 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1134 //add in potential combos
1135 if(barcodeNameVector.size() == 0){
1137 barcodeNameVector.push_back("");
1140 if(primerNameVector.size() == 0){
1142 primerNameVector.push_back("");
1145 fastaFileNames.resize(barcodeNameVector.size());
1146 for(int i=0;i<fastaFileNames.size();i++){
1147 fastaFileNames[i].assign(primerNameVector.size(), "");
1149 if(qFileName != "") { qualFileNames = fastaFileNames; }
1150 if(nameFile != "") { nameFileNames = fastaFileNames; }
1153 set<string> uniqueNames; //used to cleanup outputFileNames
1154 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1155 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1157 string primerName = primerNameVector[itPrimer->second];
1158 string barcodeName = barcodeNameVector[itBar->second];
1160 string comboGroupName = "";
1161 string fastaFileName = "";
1162 string qualFileName = "";
1163 string nameFileName = "";
1165 if(primerName == ""){
1166 comboGroupName = barcodeNameVector[itBar->second];
1169 if(barcodeName == ""){
1170 comboGroupName = primerNameVector[itPrimer->second];
1173 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1179 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1180 if (uniqueNames.count(fastaFileName) == 0) {
1181 outputNames.push_back(fastaFileName);
1182 outputTypes["fasta"].push_back(fastaFileName);
1183 uniqueNames.insert(fastaFileName);
1186 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1187 m->openOutputFile(fastaFileName, temp); temp.close();
1189 if(qFileName != ""){
1190 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1191 if (uniqueNames.count(qualFileName) == 0) {
1192 outputNames.push_back(qualFileName);
1193 outputTypes["qfile"].push_back(qualFileName);
1196 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1197 m->openOutputFile(qualFileName, temp); temp.close();
1201 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1202 if (uniqueNames.count(nameFileName) == 0) {
1203 outputNames.push_back(nameFileName);
1204 outputTypes["name"].push_back(nameFileName);
1207 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1208 m->openOutputFile(nameFileName, temp); temp.close();
1214 numFPrimers = primers.size();
1215 numRPrimers = revPrimer.size();
1217 bool allBlank = true;
1218 for (int i = 0; i < barcodeNameVector.size(); i++) {
1219 if (barcodeNameVector[i] != "") {
1224 for (int i = 0; i < primerNameVector.size(); i++) {
1225 if (primerNameVector[i] != "") {
1232 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1240 catch(exception& e) {
1241 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1245 //***************************************************************************************************************
1247 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1250 if(qscores.getName() != ""){
1251 qscores.trimQScores(-1, keepFirst);
1253 sequence.trim(keepFirst);
1256 catch(exception& e) {
1257 m->errorOut(e, "keepFirstTrim", "countDiffs");
1263 //***************************************************************************************************************
1265 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1269 int length = sequence.getNumBases() - removeLast;
1272 if(qscores.getName() != ""){
1273 qscores.trimQScores(-1, length);
1275 sequence.trim(length);
1284 catch(exception& e) {
1285 m->errorOut(e, "removeLastTrim", "countDiffs");
1291 //***************************************************************************************************************
1293 bool TrimSeqsCommand::cullLength(Sequence& seq){
1296 int length = seq.getNumBases();
1297 bool success = 0; //guilty until proven innocent
1299 if(length >= minLength && maxLength == 0) { success = 1; }
1300 else if(length >= minLength && length <= maxLength) { success = 1; }
1301 else { success = 0; }
1306 catch(exception& e) {
1307 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1313 //***************************************************************************************************************
1315 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1317 int longHomoP = seq.getLongHomoPolymer();
1318 bool success = 0; //guilty until proven innocent
1320 if(longHomoP <= maxHomoP){ success = 1; }
1321 else { success = 0; }
1325 catch(exception& e) {
1326 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1332 //***************************************************************************************************************
1334 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1336 int numNs = seq.getAmbigBases();
1337 bool success = 0; //guilty until proven innocent
1339 if(numNs <= maxAmbig) { success = 1; }
1340 else { success = 0; }
1344 catch(exception& e) {
1345 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1350 //***************************************************************************************************************