5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
12 #include "trimoligos.h"
14 //**********************************************************************************************************************
15 vector<string> TrimSeqsCommand::setParameters(){
17 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
18 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
19 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
20 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
21 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
22 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
23 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
24 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
25 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
26 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
27 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
28 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
29 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
30 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
31 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
32 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
33 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
34 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
35 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
36 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
37 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
38 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
39 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
40 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
41 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
43 vector<string> myArray;
44 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
48 m->errorOut(e, "TrimSeqsCommand", "setParameters");
52 //**********************************************************************************************************************
53 string TrimSeqsCommand::getHelpString(){
55 string helpString = "";
56 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
57 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
58 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
59 helpString += "The fasta parameter is required.\n";
60 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
61 helpString += "The oligos parameter allows you to provide an oligos file.\n";
62 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
63 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
64 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
65 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
66 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
67 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
68 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
69 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
70 helpString += "The qfile parameter allows you to provide a quality file.\n";
71 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
72 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
73 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
74 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
75 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
76 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
77 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
78 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
79 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
80 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
81 helpString += "The trim.seqs command should be in the following format: \n";
82 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
83 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
84 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
85 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
86 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
90 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
96 //**********************************************************************************************************************
98 TrimSeqsCommand::TrimSeqsCommand(){
100 abort = true; calledHelp = true;
102 vector<string> tempOutNames;
103 outputTypes["fasta"] = tempOutNames;
104 outputTypes["qfile"] = tempOutNames;
105 outputTypes["group"] = tempOutNames;
106 outputTypes["name"] = tempOutNames;
108 catch(exception& e) {
109 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
113 //***************************************************************************************************************
115 TrimSeqsCommand::TrimSeqsCommand(string option) {
118 abort = false; calledHelp = false;
121 //allow user to run help
122 if(option == "help") { help(); abort = true; calledHelp = true; }
123 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
126 vector<string> myArray = setParameters();
128 OptionParser parser(option);
129 map<string,string> parameters = parser.getParameters();
131 ValidParameters validParameter;
132 map<string,string>::iterator it;
134 //check to make sure all parameters are valid for command
135 for (it = parameters.begin(); it != parameters.end(); it++) {
136 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
139 //initialize outputTypes
140 vector<string> tempOutNames;
141 outputTypes["fasta"] = tempOutNames;
142 outputTypes["qfile"] = tempOutNames;
143 outputTypes["group"] = tempOutNames;
144 outputTypes["name"] = tempOutNames;
146 //if the user changes the input directory command factory will send this info to us in the output parameter
147 string inputDir = validParameter.validFile(parameters, "inputdir", false);
148 if (inputDir == "not found"){ inputDir = ""; }
151 it = parameters.find("fasta");
152 //user has given a template file
153 if(it != parameters.end()){
154 path = m->hasPath(it->second);
155 //if the user has not given a path then, add inputdir. else leave path alone.
156 if (path == "") { parameters["fasta"] = inputDir + it->second; }
159 it = parameters.find("oligos");
160 //user has given a template file
161 if(it != parameters.end()){
162 path = m->hasPath(it->second);
163 //if the user has not given a path then, add inputdir. else leave path alone.
164 if (path == "") { parameters["oligos"] = inputDir + it->second; }
167 it = parameters.find("qfile");
168 //user has given a template file
169 if(it != parameters.end()){
170 path = m->hasPath(it->second);
171 //if the user has not given a path then, add inputdir. else leave path alone.
172 if (path == "") { parameters["qfile"] = inputDir + it->second; }
175 it = parameters.find("name");
176 //user has given a template file
177 if(it != parameters.end()){
178 path = m->hasPath(it->second);
179 //if the user has not given a path then, add inputdir. else leave path alone.
180 if (path == "") { parameters["name"] = inputDir + it->second; }
186 //check for required parameters
187 fastaFile = validParameter.validFile(parameters, "fasta", true);
188 if (fastaFile == "not found") {
189 fastaFile = m->getFastaFile();
190 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
191 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
192 }else if (fastaFile == "not open") { abort = true; }
193 else { m->setFastaFile(fastaFile); }
195 //if the user changes the output directory command factory will send this info to us in the output parameter
196 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
198 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
202 //check for optional parameter and set defaults
203 // ...at some point should added some additional type checking...
205 temp = validParameter.validFile(parameters, "flip", false);
206 if (temp == "not found") { flip = 0; }
207 else { flip = m->isTrue(temp); }
209 temp = validParameter.validFile(parameters, "oligos", true);
210 if (temp == "not found"){ oligoFile = ""; }
211 else if(temp == "not open"){ abort = true; }
212 else { oligoFile = temp; m->setOligosFile(oligoFile); }
215 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
216 convert(temp, maxAmbig);
218 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
219 convert(temp, maxHomoP);
221 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
222 convert(temp, minLength);
224 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
225 convert(temp, maxLength);
227 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
228 convert(temp, bdiffs);
230 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
231 convert(temp, pdiffs);
233 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
234 convert(temp, tdiffs);
236 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
238 temp = validParameter.validFile(parameters, "qfile", true);
239 if (temp == "not found") { qFileName = ""; }
240 else if(temp == "not open") { abort = true; }
241 else { qFileName = temp; m->setQualFile(qFileName); }
243 temp = validParameter.validFile(parameters, "name", true);
244 if (temp == "not found") { nameFile = ""; }
245 else if(temp == "not open") { nameFile = ""; abort = true; }
246 else { nameFile = temp; m->setNameFile(nameFile); }
248 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
249 convert(temp, qThreshold);
251 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
252 qtrim = m->isTrue(temp);
254 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
255 convert(temp, qRollAverage);
257 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
258 convert(temp, qWindowAverage);
260 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
261 convert(temp, qWindowSize);
263 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
264 convert(temp, qWindowStep);
266 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
267 convert(temp, qAverage);
269 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
270 convert(temp, keepFirst);
272 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
273 convert(temp, removeLast);
275 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
276 allFiles = m->isTrue(temp);
278 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
279 m->setProcessors(temp);
280 convert(temp, processors);
283 if(allFiles && (oligoFile == "")){
284 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
286 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
287 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
291 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
292 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
298 catch(exception& e) {
299 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
303 //***************************************************************************************************************
305 int TrimSeqsCommand::execute(){
308 if (abort == true) { if (calledHelp) { return 0; } return 2; }
310 numFPrimers = 0; //this needs to be initialized
313 vector<vector<string> > fastaFileNames;
314 vector<vector<string> > qualFileNames;
315 vector<vector<string> > nameFileNames;
317 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
318 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
320 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
321 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
323 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
324 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
326 if (qFileName != "") {
327 outputNames.push_back(trimQualFile);
328 outputNames.push_back(scrapQualFile);
329 outputTypes["qfile"].push_back(trimQualFile);
330 outputTypes["qfile"].push_back(scrapQualFile);
333 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
334 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
336 if (nameFile != "") {
337 m->readNames(nameFile, nameMap);
338 outputNames.push_back(trimNameFile);
339 outputNames.push_back(scrapNameFile);
340 outputTypes["name"].push_back(trimNameFile);
341 outputTypes["name"].push_back(scrapNameFile);
344 if (m->control_pressed) { return 0; }
346 string outputGroupFileName;
348 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
350 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
351 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
355 vector<unsigned long long> fastaFilePos;
356 vector<unsigned long long> qFilePos;
358 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
360 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
361 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
362 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
364 if(qFileName == "") { qLines = lines; } //files with duds
366 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
368 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
370 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
373 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
376 if (m->control_pressed) { return 0; }
379 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
380 map<string, string>::iterator it;
381 set<string> namesToRemove;
382 for(int i=0;i<fastaFileNames.size();i++){
383 for(int j=0;j<fastaFileNames[0].size();j++){
384 if (fastaFileNames[i][j] != "") {
385 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
386 if(m->isBlank(fastaFileNames[i][j])){
387 m->mothurRemove(fastaFileNames[i][j]);
388 namesToRemove.insert(fastaFileNames[i][j]);
391 m->mothurRemove(qualFileNames[i][j]);
392 namesToRemove.insert(qualFileNames[i][j]);
396 m->mothurRemove(nameFileNames[i][j]);
397 namesToRemove.insert(nameFileNames[i][j]);
400 it = uniqueFastaNames.find(fastaFileNames[i][j]);
401 if (it == uniqueFastaNames.end()) {
402 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
410 //remove names for outputFileNames, just cleans up the output
411 vector<string> outputNames2;
412 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
413 outputNames = outputNames2;
415 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
417 m->openInputFile(it->first, in);
420 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
421 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
422 m->openOutputFile(thisGroupName, out);
425 if (m->control_pressed) { break; }
427 Sequence currSeq(in); m->gobble(in);
428 out << currSeq.getName() << '\t' << it->second << endl;
435 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
437 //output group counts
438 m->mothurOutEndLine();
440 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
441 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
442 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
444 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
446 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
448 //set fasta file as new current fastafile
450 itTypes = outputTypes.find("fasta");
451 if (itTypes != outputTypes.end()) {
452 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
455 itTypes = outputTypes.find("name");
456 if (itTypes != outputTypes.end()) {
457 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
460 itTypes = outputTypes.find("qfile");
461 if (itTypes != outputTypes.end()) {
462 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
465 itTypes = outputTypes.find("group");
466 if (itTypes != outputTypes.end()) {
467 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
470 m->mothurOutEndLine();
471 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
472 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
473 m->mothurOutEndLine();
478 catch(exception& e) {
479 m->errorOut(e, "TrimSeqsCommand", "execute");
484 /**************************************************************************************/
486 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair* line, linePair* qline) {
490 ofstream trimFASTAFile;
491 m->openOutputFile(trimFileName, trimFASTAFile);
493 ofstream scrapFASTAFile;
494 m->openOutputFile(scrapFileName, scrapFASTAFile);
496 ofstream trimQualFile;
497 ofstream scrapQualFile;
499 m->openOutputFile(trimQFileName, trimQualFile);
500 m->openOutputFile(scrapQFileName, scrapQualFile);
503 ofstream trimNameFile;
504 ofstream scrapNameFile;
506 m->openOutputFile(trimNFileName, trimNameFile);
507 m->openOutputFile(scrapNFileName, scrapNameFile);
511 ofstream outGroupsFile;
512 if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); }
514 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
515 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
516 if (fastaFileNames[i][j] != "") {
518 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
520 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
524 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
532 m->openInputFile(filename, inFASTA);
533 inFASTA.seekg(line->start);
536 if(qFileName != "") {
537 m->openInputFile(qFileName, qFile);
538 qFile.seekg(qline->start);
543 TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer);
547 if (m->control_pressed) {
548 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
549 if (createGroup) { outGroupsFile.close(); }
554 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
560 string trashCode = "";
561 int currentSeqsDiffs = 0;
563 Sequence currSeq(inFASTA); m->gobble(inFASTA);
564 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
565 QualityScores currQual;
567 currQual = QualityScores(qFile); m->gobble(qFile);
570 string origSeq = currSeq.getUnaligned();
573 int barcodeIndex = 0;
576 if(barcodes.size() != 0){
577 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
578 if(success > bdiffs) { trashCode += 'b'; }
579 else{ currentSeqsDiffs += success; }
582 if(numFPrimers != 0){
583 success = trimOligos.stripForward(currSeq, currQual, primerIndex);
584 if(success > pdiffs) { trashCode += 'f'; }
585 else{ currentSeqsDiffs += success; }
588 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
590 if(numRPrimers != 0){
591 success = trimOligos.stripReverse(currSeq, currQual);
592 if(!success) { trashCode += 'r'; }
596 success = keepFirstTrim(currSeq, currQual);
600 success = removeLastTrim(currSeq, currQual);
601 if(!success) { trashCode += 'l'; }
606 int origLength = currSeq.getNumBases();
608 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
609 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
610 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
611 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
612 else { success = 1; }
614 //you don't want to trim, if it fails above then scrap it
615 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
617 if(!success) { trashCode += 'q'; }
620 if(minLength > 0 || maxLength > 0){
621 success = cullLength(currSeq);
622 if(!success) { trashCode += 'l'; }
625 success = cullHomoP(currSeq);
626 if(!success) { trashCode += 'h'; }
629 success = cullAmbigs(currSeq);
630 if(!success) { trashCode += 'n'; }
633 if(flip){ // should go last
634 currSeq.reverseComplement();
636 currQual.flipQScores();
640 if(trashCode.length() == 0){
641 currSeq.setAligned(currSeq.getUnaligned());
642 currSeq.printSequence(trimFASTAFile);
645 currQual.printQScores(trimQualFile);
649 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
650 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
651 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
655 if(barcodes.size() != 0){
656 string thisGroup = barcodeNameVector[barcodeIndex];
657 if (primers.size() != 0) {
658 if (primerNameVector[primerIndex] != "") {
659 if(thisGroup != "") {
660 thisGroup += "." + primerNameVector[primerIndex];
662 thisGroup = primerNameVector[primerIndex];
667 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
669 if (nameFile != "") {
670 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
671 if (itName != nameMap.end()) {
672 vector<string> thisSeqsNames;
673 m->splitAtChar(itName->second, thisSeqsNames, ',');
674 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
675 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
677 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
680 map<string, int>::iterator it = groupCounts.find(thisGroup);
681 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
682 else { groupCounts[it->first]++; }
689 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
690 currSeq.printSequence(output);
694 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
695 currQual.printQScores(output);
700 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
701 if (itName != nameMap.end()) {
702 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
703 output << itName->first << '\t' << itName->second << endl;
705 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
710 if(nameFile != ""){ //needs to be before the currSeq name is changed
711 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
712 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
713 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
715 currSeq.setName(currSeq.getName() + '|' + trashCode);
716 currSeq.setUnaligned(origSeq);
717 currSeq.setAligned(origSeq);
718 currSeq.printSequence(scrapFASTAFile);
720 currQual.printQScores(scrapQualFile);
726 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
727 unsigned long long pos = inFASTA.tellg();
728 if ((pos == -1) || (pos >= line->end)) { break; }
731 if (inFASTA.eof()) { break; }
735 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
739 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
743 trimFASTAFile.close();
744 scrapFASTAFile.close();
745 if (createGroup) { outGroupsFile.close(); }
746 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
747 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
751 catch(exception& e) {
752 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
757 /**************************************************************************************************/
759 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
761 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
766 //loop through and create all the processes you want
767 while (process != processors) {
771 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
775 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
776 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
777 vector<vector<string> > tempNameFileNames = nameFileNames;
782 for(int i=0;i<tempFASTAFileNames.size();i++){
783 for(int j=0;j<tempFASTAFileNames[i].size();j++){
784 if (tempFASTAFileNames[i][j] != "") {
785 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
786 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
789 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
790 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
793 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
794 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
801 driverCreateTrim(filename,
803 (trimFASTAFileName + toString(getpid()) + ".temp"),
804 (scrapFASTAFileName + toString(getpid()) + ".temp"),
805 (trimQualFileName + toString(getpid()) + ".temp"),
806 (scrapQualFileName + toString(getpid()) + ".temp"),
807 (trimNameFileName + toString(getpid()) + ".temp"),
808 (scrapNameFileName + toString(getpid()) + ".temp"),
809 (groupFile + toString(getpid()) + ".temp"),
811 tempPrimerQualFileNames,
816 //pass groupCounts to parent
819 string tempFile = filename + toString(getpid()) + ".num.temp";
820 m->openOutputFile(tempFile, out);
822 out << groupCounts.size() << endl;
824 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
825 out << it->first << '\t' << it->second << endl;
831 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
832 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
839 m->openOutputFile(trimFASTAFileName, temp); temp.close();
840 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
842 m->openOutputFile(trimQualFileName, temp); temp.close();
843 m->openOutputFile(scrapQualFileName, temp); temp.close();
845 if (nameFile != "") {
846 m->openOutputFile(trimNameFileName, temp); temp.close();
847 m->openOutputFile(scrapNameFileName, temp); temp.close();
850 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
852 //force parent to wait until all the processes are done
853 for (int i=0;i<processIDS.size();i++) {
854 int temp = processIDS[i];
859 for(int i=0;i<processIDS.size();i++){
861 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
863 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
864 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
865 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
866 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
869 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
870 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
871 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
872 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
876 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
877 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
878 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
879 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
883 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
884 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
889 for(int j=0;j<fastaFileNames.size();j++){
890 for(int k=0;k<fastaFileNames[j].size();k++){
891 if (fastaFileNames[j][k] != "") {
892 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
893 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
896 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
897 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
901 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
902 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
911 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
912 m->openInputFile(tempFile, in);
916 in >> tempNum; m->gobble(in);
920 in >> group >> tempNum; m->gobble(in);
922 map<string, int>::iterator it = groupCounts.find(group);
923 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
924 else { groupCounts[it->first] += tempNum; }
927 in.close(); m->mothurRemove(tempFile);
935 catch(exception& e) {
936 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
941 /**************************************************************************************************/
943 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long long>& fastaFilePos, vector<unsigned long long>& qfileFilePos) {
945 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
946 //set file positions for fasta file
947 fastaFilePos = m->divideFile(filename, processors);
949 if (qfilename == "") { return processors; }
951 //get name of first sequence in each chunk
952 map<string, int> firstSeqNames;
953 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
955 m->openInputFile(filename, in);
956 in.seekg(fastaFilePos[i]);
959 firstSeqNames[temp.getName()] = i;
964 //seach for filePos of each first name in the qfile and save in qfileFilePos
966 m->openInputFile(qfilename, inQual);
969 while(!inQual.eof()){
970 input = m->getline(inQual);
972 if (input.length() != 0) {
973 if(input[0] == '>'){ //this is a sequence name line
974 istringstream nameStream(input);
976 string sname = ""; nameStream >> sname;
977 sname = sname.substr(1);
979 map<string, int>::iterator it = firstSeqNames.find(sname);
981 if(it != firstSeqNames.end()) { //this is the start of a new chunk
982 unsigned long long pos = inQual.tellg();
983 qfileFilePos.push_back(pos - input.length() - 1);
984 firstSeqNames.erase(it);
989 if (firstSeqNames.size() == 0) { break; }
994 if (firstSeqNames.size() != 0) {
995 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
996 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1002 //get last file position of qfile
1004 unsigned long long size;
1006 //get num bytes in file
1007 pFile = fopen (qfilename.c_str(),"rb");
1008 if (pFile==NULL) perror ("Error opening file");
1010 fseek (pFile, 0, SEEK_END);
1015 qfileFilePos.push_back(size);
1021 fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1022 fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
1027 catch(exception& e) {
1028 m->errorOut(e, "TrimSeqsCommand", "setLines");
1033 //***************************************************************************************************************
1035 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1038 m->openInputFile(oligoFile, inOligos);
1042 string type, oligo, group;
1044 int indexPrimer = 0;
1045 int indexBarcode = 0;
1047 while(!inOligos.eof()){
1052 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1053 m->gobble(inOligos);
1056 m->gobble(inOligos);
1057 //make type case insensitive
1058 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1062 for(int i=0;i<oligo.length();i++){
1063 oligo[i] = toupper(oligo[i]);
1064 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1067 if(type == "FORWARD"){
1070 // get rest of line in case there is a primer name
1071 while (!inOligos.eof()) {
1072 char c = inOligos.get();
1073 if (c == 10 || c == 13){ break; }
1074 else if (c == 32 || c == 9){;} //space or tab
1075 else { group += c; }
1078 //check for repeat barcodes
1079 map<string, int>::iterator itPrime = primers.find(oligo);
1080 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1082 primers[oligo]=indexPrimer; indexPrimer++;
1083 primerNameVector.push_back(group);
1085 else if(type == "REVERSE"){
1086 Sequence oligoRC("reverse", oligo);
1087 oligoRC.reverseComplement();
1088 revPrimer.push_back(oligoRC.getUnaligned());
1090 else if(type == "BARCODE"){
1093 //check for repeat barcodes
1094 map<string, int>::iterator itBar = barcodes.find(oligo);
1095 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1097 barcodes[oligo]=indexBarcode; indexBarcode++;
1098 barcodeNameVector.push_back(group);
1100 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1102 m->gobble(inOligos);
1106 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1108 //add in potential combos
1109 if(barcodeNameVector.size() == 0){
1111 barcodeNameVector.push_back("");
1114 if(primerNameVector.size() == 0){
1116 primerNameVector.push_back("");
1119 fastaFileNames.resize(barcodeNameVector.size());
1120 for(int i=0;i<fastaFileNames.size();i++){
1121 fastaFileNames[i].assign(primerNameVector.size(), "");
1123 if(qFileName != "") { qualFileNames = fastaFileNames; }
1124 if(nameFile != "") { nameFileNames = fastaFileNames; }
1127 set<string> uniqueNames; //used to cleanup outputFileNames
1128 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1129 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1131 string primerName = primerNameVector[itPrimer->second];
1132 string barcodeName = barcodeNameVector[itBar->second];
1134 string comboGroupName = "";
1135 string fastaFileName = "";
1136 string qualFileName = "";
1137 string nameFileName = "";
1139 if(primerName == ""){
1140 comboGroupName = barcodeNameVector[itBar->second];
1143 if(barcodeName == ""){
1144 comboGroupName = primerNameVector[itPrimer->second];
1147 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1153 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1154 if (uniqueNames.count(fastaFileName) == 0) {
1155 outputNames.push_back(fastaFileName);
1156 outputTypes["fasta"].push_back(fastaFileName);
1157 uniqueNames.insert(fastaFileName);
1160 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1161 m->openOutputFile(fastaFileName, temp); temp.close();
1163 if(qFileName != ""){
1164 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1165 if (uniqueNames.count(qualFileName) == 0) {
1166 outputNames.push_back(qualFileName);
1167 outputTypes["qfile"].push_back(qualFileName);
1170 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1171 m->openOutputFile(qualFileName, temp); temp.close();
1175 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1176 if (uniqueNames.count(nameFileName) == 0) {
1177 outputNames.push_back(nameFileName);
1178 outputTypes["name"].push_back(nameFileName);
1181 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1182 m->openOutputFile(nameFileName, temp); temp.close();
1188 numFPrimers = primers.size();
1189 numRPrimers = revPrimer.size();
1191 bool allBlank = true;
1192 for (int i = 0; i < barcodeNameVector.size(); i++) {
1193 if (barcodeNameVector[i] != "") {
1198 for (int i = 0; i < primerNameVector.size(); i++) {
1199 if (primerNameVector[i] != "") {
1206 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1214 catch(exception& e) {
1215 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1219 //***************************************************************************************************************
1221 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1224 if(qscores.getName() != ""){
1225 qscores.trimQScores(-1, keepFirst);
1227 sequence.trim(keepFirst);
1230 catch(exception& e) {
1231 m->errorOut(e, "keepFirstTrim", "countDiffs");
1237 //***************************************************************************************************************
1239 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1243 int length = sequence.getNumBases() - removeLast;
1246 if(qscores.getName() != ""){
1247 qscores.trimQScores(-1, length);
1249 sequence.trim(length);
1258 catch(exception& e) {
1259 m->errorOut(e, "removeLastTrim", "countDiffs");
1265 //***************************************************************************************************************
1267 bool TrimSeqsCommand::cullLength(Sequence& seq){
1270 int length = seq.getNumBases();
1271 bool success = 0; //guilty until proven innocent
1273 if(length >= minLength && maxLength == 0) { success = 1; }
1274 else if(length >= minLength && length <= maxLength) { success = 1; }
1275 else { success = 0; }
1280 catch(exception& e) {
1281 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1287 //***************************************************************************************************************
1289 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1291 int longHomoP = seq.getLongHomoPolymer();
1292 bool success = 0; //guilty until proven innocent
1294 if(longHomoP <= maxHomoP){ success = 1; }
1295 else { success = 0; }
1299 catch(exception& e) {
1300 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1306 //***************************************************************************************************************
1308 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1310 int numNs = seq.getAmbigBases();
1311 bool success = 0; //guilty until proven innocent
1313 if(numNs <= maxAmbig) { success = 1; }
1314 else { success = 0; }
1318 catch(exception& e) {
1319 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1324 //***************************************************************************************************************