5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
13 //**********************************************************************************************************************
15 vector<string> TrimSeqsCommand::getValidParameters(){
17 string Array[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile",
18 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
19 "keepfirst", "removelast",
20 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
21 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25 m->errorOut(e, "TrimSeqsCommand", "getValidParameters");
30 //**********************************************************************************************************************
32 TrimSeqsCommand::TrimSeqsCommand(){
34 abort = true; calledHelp = true;
35 vector<string> tempOutNames;
36 outputTypes["fasta"] = tempOutNames;
37 outputTypes["qual"] = tempOutNames;
38 outputTypes["group"] = tempOutNames;
41 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
46 //**********************************************************************************************************************
48 vector<string> TrimSeqsCommand::getRequiredParameters(){
50 string Array[] = {"fasta"};
51 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
55 m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters");
60 //**********************************************************************************************************************
62 vector<string> TrimSeqsCommand::getRequiredFiles(){
64 vector<string> myArray;
68 m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles");
73 //***************************************************************************************************************
75 TrimSeqsCommand::TrimSeqsCommand(string option) {
78 abort = false; calledHelp = false;
81 //allow user to run help
82 if(option == "help") { help(); abort = true; calledHelp = true; }
85 //valid paramters for this command
86 string AlignArray[] = { "fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile",
87 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
88 "keepfirst", "removelast",
89 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
91 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
93 OptionParser parser(option);
94 map<string,string> parameters = parser.getParameters();
96 ValidParameters validParameter;
97 map<string,string>::iterator it;
99 //check to make sure all parameters are valid for command
100 for (it = parameters.begin(); it != parameters.end(); it++) {
101 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
104 //initialize outputTypes
105 vector<string> tempOutNames;
106 outputTypes["fasta"] = tempOutNames;
107 outputTypes["qual"] = tempOutNames;
108 outputTypes["group"] = tempOutNames;
110 //if the user changes the input directory command factory will send this info to us in the output parameter
111 string inputDir = validParameter.validFile(parameters, "inputdir", false);
112 if (inputDir == "not found"){ inputDir = ""; }
115 it = parameters.find("fasta");
116 //user has given a template file
117 if(it != parameters.end()){
118 path = m->hasPath(it->second);
119 //if the user has not given a path then, add inputdir. else leave path alone.
120 if (path == "") { parameters["fasta"] = inputDir + it->second; }
123 it = parameters.find("oligos");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["oligos"] = inputDir + it->second; }
131 it = parameters.find("qfile");
132 //user has given a template file
133 if(it != parameters.end()){
134 path = m->hasPath(it->second);
135 //if the user has not given a path then, add inputdir. else leave path alone.
136 if (path == "") { parameters["qfile"] = inputDir + it->second; }
139 it = parameters.find("group");
140 //user has given a template file
141 if(it != parameters.end()){
142 path = m->hasPath(it->second);
143 //if the user has not given a path then, add inputdir. else leave path alone.
144 if (path == "") { parameters["group"] = inputDir + it->second; }
149 //check for required parameters
150 fastaFile = validParameter.validFile(parameters, "fasta", true);
151 if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; }
152 else if (fastaFile == "not open") { abort = true; }
154 //if the user changes the output directory command factory will send this info to us in the output parameter
155 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
157 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
161 //check for optional parameter and set defaults
162 // ...at some point should added some additional type checking...
164 temp = validParameter.validFile(parameters, "flip", false);
165 if (temp == "not found"){ flip = 0; }
166 else if(m->isTrue(temp)) { flip = 1; }
168 temp = validParameter.validFile(parameters, "oligos", true);
169 if (temp == "not found"){ oligoFile = ""; }
170 else if(temp == "not open"){ abort = true; }
171 else { oligoFile = temp; }
173 temp = validParameter.validFile(parameters, "group", true);
174 if (temp == "not found"){ groupfile = ""; }
175 else if(temp == "not open"){ abort = true; }
176 else { groupfile = temp; }
178 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
179 convert(temp, maxAmbig);
181 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
182 convert(temp, maxHomoP);
184 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
185 convert(temp, minLength);
187 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
188 convert(temp, maxLength);
190 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
191 convert(temp, bdiffs);
193 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
194 convert(temp, pdiffs);
196 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
197 convert(temp, tdiffs);
199 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
201 temp = validParameter.validFile(parameters, "qfile", true);
202 if (temp == "not found") { qFileName = ""; }
203 else if(temp == "not open") { abort = true; }
204 else { qFileName = temp; }
206 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
207 convert(temp, qThreshold);
209 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
210 qtrim = m->isTrue(temp);
212 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
213 convert(temp, qRollAverage);
215 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
216 convert(temp, qWindowAverage);
218 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
219 convert(temp, qWindowSize);
221 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
222 convert(temp, qWindowStep);
224 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
225 convert(temp, qAverage);
227 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
228 convert(temp, keepFirst);
230 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
231 convert(temp, removeLast);
233 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
234 allFiles = m->isTrue(temp);
236 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
237 convert(temp, processors);
239 if ((oligoFile != "") && (groupfile != "")) {
240 m->mothurOut("You given both a oligos file and a groupfile, only one is allowed."); m->mothurOutEndLine(); abort = true;
244 if(allFiles && (oligoFile == "") && (groupfile == "")){
245 m->mothurOut("You selected allfiles, but didn't enter an oligos or group file. Ignoring the allfiles request."); m->mothurOutEndLine();
247 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
248 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
252 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
253 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
259 catch(exception& e) {
260 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
265 //**********************************************************************************************************************
267 void TrimSeqsCommand::help(){
269 m->mothurOut("The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n");
270 m->mothurOut("The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n");
271 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, group, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n");
272 m->mothurOut("The fasta parameter is required.\n");
273 m->mothurOut("The group parameter allows you to enter a group file for your fasta file.\n");
274 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
275 m->mothurOut("The oligos parameter allows you to provide an oligos file.\n");
276 m->mothurOut("The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n");
277 m->mothurOut("The maxhomop parameter allows you to set a maximum homopolymer length. \n");
278 m->mothurOut("The minlength parameter allows you to set and minimum sequence length. \n");
279 m->mothurOut("The maxlength parameter allows you to set and maximum sequence length. \n");
280 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
281 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
282 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
283 m->mothurOut("The qfile parameter allows you to provide a quality file.\n");
284 m->mothurOut("The qthreshold parameter allows you to set a minimum quality score allowed. \n");
285 m->mothurOut("The qaverage parameter allows you to set a minimum average quality score allowed. \n");
286 m->mothurOut("The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n");
287 m->mothurOut("The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n");
288 m->mothurOut("The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n");
289 m->mothurOut("The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n");
290 m->mothurOut("The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n");
291 m->mothurOut("The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n");
292 m->mothurOut("The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n");
293 m->mothurOut("The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n");
294 m->mothurOut("The trim.seqs command should be in the following format: \n");
295 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n");
296 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n");
297 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
298 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
299 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
302 catch(exception& e) {
303 m->errorOut(e, "TrimSeqsCommand", "help");
309 //***************************************************************************************************************
311 TrimSeqsCommand::~TrimSeqsCommand(){ /* do nothing */ }
313 //***************************************************************************************************************
315 int TrimSeqsCommand::execute(){
318 if (abort == true) { if (calledHelp) { return 0; } return 2; }
320 numFPrimers = 0; //this needs to be initialized
322 vector<vector<string> > fastaFileNames;
323 vector<vector<string> > qualFileNames;
325 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
326 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
328 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
329 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
331 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
332 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
333 if (qFileName != "") {
334 outputNames.push_back(trimQualFile);
335 outputNames.push_back(scrapQualFile);
336 outputTypes["qual"].push_back(trimQualFile);
337 outputTypes["qual"].push_back(scrapQualFile);
340 string outputGroupFileName;
343 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
344 outputNames.push_back(outputGroupFileName); outputTypes["groups"].push_back(outputGroupFileName);
345 getOligos(fastaFileNames, qualFileNames);
348 vector<unsigned long int> fastaFilePos;
349 vector<unsigned long int> qFilePos;
351 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
353 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
354 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
355 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
357 if(qFileName == "") { qLines = lines; } //files with duds
359 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
361 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
363 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames);
366 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
369 if (m->control_pressed) { return 0; }
373 for(int i=0;i<fastaFileNames.size();i++){
374 for(int j=0;j<fastaFileNames[0].size();j++){
375 if(m->isBlank(fastaFileNames[i][j])){
376 remove(fastaFileNames[i][j].c_str());
379 remove(fastaFileNames[i][j].c_str());
389 if (m->control_pressed) {
390 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
394 m->mothurOutEndLine();
395 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
396 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
397 m->mothurOutEndLine();
402 catch(exception& e) {
403 m->errorOut(e, "TrimSeqsCommand", "execute");
408 /**************************************************************************************/
410 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {
414 ofstream trimFASTAFile;
415 m->openOutputFile(trimFileName, trimFASTAFile);
417 ofstream scrapFASTAFile;
418 m->openOutputFile(scrapFileName, scrapFASTAFile);
420 ofstream trimQualFile;
421 ofstream scrapQualFile;
423 m->openOutputFile(trimQFileName, trimQualFile);
424 m->openOutputFile(scrapQFileName, scrapQualFile);
427 ofstream outGroupsFile;
428 if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); }
431 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
432 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
434 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
436 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
443 m->openInputFile(filename, inFASTA);
444 inFASTA.seekg(line->start);
447 if(qFileName != "") {
448 m->openInputFile(qFileName, qFile);
449 qFile.seekg(qline->start);
457 if (m->control_pressed) {
458 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
459 if (oligoFile != "") { outGroupsFile.close(); }
464 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
470 string trashCode = "";
471 int currentSeqsDiffs = 0;
473 Sequence currSeq(inFASTA); m->gobble(inFASTA);
475 QualityScores currQual;
477 currQual = QualityScores(qFile, currSeq.getNumBases()); m->gobble(qFile);
480 string origSeq = currSeq.getUnaligned();
483 int barcodeIndex = 0;
486 if(barcodes.size() != 0){
487 success = stripBarcode(currSeq, currQual, barcodeIndex);
488 if(success > bdiffs) { trashCode += 'b'; }
489 else{ currentSeqsDiffs += success; }
492 if(numFPrimers != 0){
493 success = stripForward(currSeq, currQual, primerIndex);
494 if(success > pdiffs) { trashCode += 'f'; }
495 else{ currentSeqsDiffs += success; }
498 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
500 if(numRPrimers != 0){
501 success = stripReverse(currSeq, currQual);
502 if(!success) { trashCode += 'r'; }
506 success = keepFirstTrim(currSeq, currQual);
510 success = removeLastTrim(currSeq, currQual);
511 if(!success) { trashCode += 'l'; }
516 int origLength = currSeq.getNumBases();
518 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
519 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
520 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
521 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
522 else { success = 1; }
524 //you don't want to trim, if it fails above then scrap it
525 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
527 if(!success) { trashCode += 'q'; }
530 if(minLength > 0 || maxLength > 0){
531 success = cullLength(currSeq);
532 if(!success) { trashCode += 'l'; }
535 success = cullHomoP(currSeq);
536 if(!success) { trashCode += 'h'; }
539 success = cullAmbigs(currSeq);
540 if(!success) { trashCode += 'n'; }
543 if(flip){ // should go last
544 currSeq.reverseComplement();
546 currQual.flipQScores();
550 if(trashCode.length() == 0){
551 currSeq.setAligned(currSeq.getUnaligned());
552 currSeq.printSequence(trimFASTAFile);
555 currQual.printQScores(trimQualFile);
558 if(barcodes.size() != 0){
559 outGroupsFile << currSeq.getName() << '\t' << barcodeNameVector[barcodeIndex] << endl;
565 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
566 currSeq.printSequence(output);
570 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
571 currQual.printQScores(output);
577 currSeq.setName(currSeq.getName() + '|' + trashCode);
578 currSeq.setUnaligned(origSeq);
579 currSeq.setAligned(origSeq);
580 currSeq.printSequence(scrapFASTAFile);
582 currQual.printQScores(scrapQualFile);
588 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
589 unsigned long int pos = inFASTA.tellg();
590 if ((pos == -1) || (pos >= line->end)) { break; }
592 if (inFASTA.eof()) { break; }
596 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
600 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
604 trimFASTAFile.close();
605 scrapFASTAFile.close();
606 if (oligoFile != "") { outGroupsFile.close(); }
607 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
611 catch(exception& e) {
612 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
617 /**************************************************************************************************/
619 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
621 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
626 //loop through and create all the processes you want
627 while (process != processors) {
631 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
635 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
636 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
641 for(int i=0;i<tempFASTAFileNames.size();i++){
642 for(int j=0;j<tempFASTAFileNames[i].size();j++){
643 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
644 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
647 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
648 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
654 driverCreateTrim(filename,
656 (trimFASTAFileName + toString(getpid()) + ".temp"),
657 (scrapFASTAFileName + toString(getpid()) + ".temp"),
658 (trimQualFileName + toString(getpid()) + ".temp"),
659 (scrapQualFileName + toString(getpid()) + ".temp"),
660 (groupFile + toString(getpid()) + ".temp"),
662 tempPrimerQualFileNames,
668 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
669 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
676 m->openOutputFile(trimFASTAFileName, temp); temp.close();
677 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
678 m->openOutputFile(trimQualFileName, temp); temp.close();
679 m->openOutputFile(scrapQualFileName, temp); temp.close();
683 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
686 //force parent to wait until all the processes are done
687 for (int i=0;i<processIDS.size();i++) {
688 int temp = processIDS[i];
693 for(int i=0;i<processIDS.size();i++){
695 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
697 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
698 remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
699 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
700 remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
703 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
704 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
705 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
706 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
709 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
710 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
714 for(int j=0;j<fastaFileNames.size();j++){
715 for(int k=0;k<fastaFileNames[j].size();k++){
716 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
717 remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
720 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
721 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
732 catch(exception& e) {
733 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
738 /**************************************************************************************************/
740 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
743 //set file positions for fasta file
744 fastaFilePos = m->divideFile(filename, processors);
746 if (qfilename == "") { return processors; }
748 //get name of first sequence in each chunk
749 map<string, int> firstSeqNames;
750 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
752 m->openInputFile(filename, in);
753 in.seekg(fastaFilePos[i]);
756 firstSeqNames[temp.getName()] = i;
761 //seach for filePos of each first name in the qfile and save in qfileFilePos
763 m->openInputFile(qfilename, inQual);
766 while(!inQual.eof()){
767 input = m->getline(inQual);
769 if (input.length() != 0) {
770 if(input[0] == '>'){ //this is a sequence name line
771 istringstream nameStream(input);
773 string sname = ""; nameStream >> sname;
774 sname = sname.substr(1);
776 map<string, int>::iterator it = firstSeqNames.find(sname);
778 if(it != firstSeqNames.end()) { //this is the start of a new chunk
779 unsigned long int pos = inQual.tellg();
780 qfileFilePos.push_back(pos - input.length() - 1);
781 firstSeqNames.erase(it);
786 if (firstSeqNames.size() == 0) { break; }
791 if (firstSeqNames.size() != 0) {
792 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
793 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
799 //get last file position of qfile
801 unsigned long int size;
803 //get num bytes in file
804 pFile = fopen (qfilename.c_str(),"rb");
805 if (pFile==NULL) perror ("Error opening file");
807 fseek (pFile, 0, SEEK_END);
812 qfileFilePos.push_back(size);
816 catch(exception& e) {
817 m->errorOut(e, "TrimSeqsCommand", "setLines");
822 //***************************************************************************************************************
824 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
827 m->openInputFile(oligoFile, inOligos);
831 string type, oligo, group;
834 int indexBarcode = 0;
836 while(!inOligos.eof()){
838 inOligos >> type; m->gobble(inOligos);
841 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
844 //make type case insensitive
845 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
849 for(int i=0;i<oligo.length();i++){
850 oligo[i] = toupper(oligo[i]);
851 if(oligo[i] == 'U') { oligo[i] = 'T'; }
854 if(type == "FORWARD"){
857 // get rest of line in case there is a primer name
858 while (!inOligos.eof()) {
859 char c = inOligos.get();
860 if (c == 10 || c == 13){ break; }
861 else if (c == 32 || c == 9){;} //space or tab
865 //check for repeat barcodes
866 map<string, int>::iterator itPrime = primers.find(oligo);
867 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
869 primers[oligo]=indexPrimer; indexPrimer++;
870 primerNameVector.push_back(group);
872 else if(type == "REVERSE"){
873 Sequence oligoRC("reverse", oligo);
874 oligoRC.reverseComplement();
875 revPrimer.push_back(oligoRC.getUnaligned());
877 else if(type == "BARCODE"){
880 //check for repeat barcodes
881 map<string, int>::iterator itBar = barcodes.find(oligo);
882 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
884 barcodes[oligo]=indexBarcode; indexBarcode++;
885 barcodeNameVector.push_back(group);
887 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
893 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
895 //add in potential combos
896 if(barcodeNameVector.size() == 0){
898 barcodeNameVector.push_back("");
901 if(primerNameVector.size() == 0){
903 primerNameVector.push_back("");
906 fastaFileNames.resize(barcodeNameVector.size());
907 for(int i=0;i<fastaFileNames.size();i++){
908 fastaFileNames[i].assign(primerNameVector.size(), "");
910 if(qFileName != ""){ qualFileNames = fastaFileNames; }
913 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
914 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
916 string primerName = primerNameVector[itPrimer->second];
917 string barcodeName = barcodeNameVector[itBar->second];
919 string comboGroupName = "";
920 string fastaFileName = "";
921 string qualFileName = "";
923 if(primerName == ""){
924 comboGroupName = barcodeNameVector[itBar->second];
927 if(barcodeName == ""){
928 comboGroupName = primerNameVector[itPrimer->second];
931 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
936 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
937 outputNames.push_back(fastaFileName);
938 outputTypes["fasta"].push_back(fastaFileName);
939 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
940 m->openOutputFile(fastaFileName, temp); temp.close();
943 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
944 outputNames.push_back(qualFileName);
945 outputTypes["qual"].push_back(qualFileName);
946 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
947 m->openOutputFile(qualFileName, temp); temp.close();
952 numFPrimers = primers.size();
953 numRPrimers = revPrimer.size();
956 catch(exception& e) {
957 m->errorOut(e, "TrimSeqsCommand", "getOligos");
962 //***************************************************************************************************************
964 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
967 string rawSequence = seq.getUnaligned();
968 int success = bdiffs + 1; //guilty until proven innocent
970 //can you find the barcode
971 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
972 string oligo = it->first;
973 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
974 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
978 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
980 seq.setUnaligned(rawSequence.substr(oligo.length()));
982 if(qual.getName() != ""){
983 qual.trimQScores(oligo.length(), -1);
991 //if you found the barcode or if you don't want to allow for diffs
992 if ((bdiffs == 0) || (success == 0)) { return success; }
994 else { //try aligning and see if you can find it
998 Alignment* alignment;
999 if (barcodes.size() > 0) {
1000 map<string,int>::iterator it=barcodes.begin();
1002 for(it;it!=barcodes.end();it++){
1003 if(it->first.length() > maxLength){
1004 maxLength = it->first.length();
1007 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
1009 }else{ alignment = NULL; }
1011 //can you find the barcode
1017 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1018 string oligo = it->first;
1019 // int length = oligo.length();
1021 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
1022 success = bdiffs + 10;
1026 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1027 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1028 oligo = alignment->getSeqAAln();
1029 string temp = alignment->getSeqBAln();
1031 int alnLength = oligo.length();
1033 for(int i=oligo.length()-1;i>=0;i--){
1034 if(oligo[i] != '-'){ alnLength = i+1; break; }
1036 oligo = oligo.substr(0,alnLength);
1037 temp = temp.substr(0,alnLength);
1039 int numDiff = countDiffs(oligo, temp);
1041 if(numDiff < minDiff){
1044 minGroup = it->second;
1046 for(int i=0;i<alnLength;i++){
1052 else if(numDiff == minDiff){
1058 if(minDiff > bdiffs) { success = minDiff; } //no good matches
1059 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
1060 else{ //use the best match
1062 seq.setUnaligned(rawSequence.substr(minPos));
1064 if(qual.getName() != ""){
1065 qual.trimQScores(minPos, -1);
1070 if (alignment != NULL) { delete alignment; }
1077 catch(exception& e) {
1078 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1084 //***************************************************************************************************************
1086 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1088 string rawSequence = seq.getUnaligned();
1089 int success = pdiffs + 1; //guilty until proven innocent
1091 //can you find the primer
1092 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1093 string oligo = it->first;
1094 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
1095 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1099 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1101 seq.setUnaligned(rawSequence.substr(oligo.length()));
1102 if(qual.getName() != ""){
1103 qual.trimQScores(oligo.length(), -1);
1110 //if you found the barcode or if you don't want to allow for diffs
1111 if ((pdiffs == 0) || (success == 0)) { return success; }
1113 else { //try aligning and see if you can find it
1117 Alignment* alignment;
1118 if (primers.size() > 0) {
1119 map<string,int>::iterator it=primers.begin();
1121 for(it;it!=primers.end();it++){
1122 if(it->first.length() > maxLength){
1123 maxLength = it->first.length();
1126 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
1128 }else{ alignment = NULL; }
1130 //can you find the barcode
1136 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1137 string oligo = it->first;
1138 // int length = oligo.length();
1140 if(rawSequence.length() < maxLength){
1141 success = pdiffs + 100;
1145 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1146 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1147 oligo = alignment->getSeqAAln();
1148 string temp = alignment->getSeqBAln();
1150 int alnLength = oligo.length();
1152 for(int i=oligo.length()-1;i>=0;i--){
1153 if(oligo[i] != '-'){ alnLength = i+1; break; }
1155 oligo = oligo.substr(0,alnLength);
1156 temp = temp.substr(0,alnLength);
1158 int numDiff = countDiffs(oligo, temp);
1160 if(numDiff < minDiff){
1163 minGroup = it->second;
1165 for(int i=0;i<alnLength;i++){
1171 else if(numDiff == minDiff){
1177 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1178 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
1179 else{ //use the best match
1181 seq.setUnaligned(rawSequence.substr(minPos));
1182 if(qual.getName() != ""){
1183 qual.trimQScores(minPos, -1);
1188 if (alignment != NULL) { delete alignment; }
1195 catch(exception& e) {
1196 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1201 //***************************************************************************************************************
1203 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1205 string rawSequence = seq.getUnaligned();
1206 bool success = 0; //guilty until proven innocent
1208 for(int i=0;i<numRPrimers;i++){
1209 string oligo = revPrimer[i];
1211 if(rawSequence.length() < oligo.length()){
1216 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1217 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1218 if(qual.getName() != ""){
1219 qual.trimQScores(-1, rawSequence.length()-oligo.length());
1228 catch(exception& e) {
1229 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1234 //***************************************************************************************************************
1236 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1239 if(qscores.getName() != ""){
1240 qscores.trimQScores(-1, keepFirst);
1242 sequence.trim(keepFirst);
1245 catch(exception& e) {
1246 m->errorOut(e, "keepFirstTrim", "countDiffs");
1252 //***************************************************************************************************************
1254 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1258 int length = sequence.getNumBases() - removeLast;
1261 if(qscores.getName() != ""){
1262 qscores.trimQScores(-1, length);
1264 sequence.trim(length);
1273 catch(exception& e) {
1274 m->errorOut(e, "removeLastTrim", "countDiffs");
1280 //***************************************************************************************************************
1282 bool TrimSeqsCommand::cullLength(Sequence& seq){
1285 int length = seq.getNumBases();
1286 bool success = 0; //guilty until proven innocent
1288 if(length >= minLength && maxLength == 0) { success = 1; }
1289 else if(length >= minLength && length <= maxLength) { success = 1; }
1290 else { success = 0; }
1295 catch(exception& e) {
1296 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1302 //***************************************************************************************************************
1304 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1306 int longHomoP = seq.getLongHomoPolymer();
1307 bool success = 0; //guilty until proven innocent
1309 if(longHomoP <= maxHomoP){ success = 1; }
1310 else { success = 0; }
1314 catch(exception& e) {
1315 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1321 //***************************************************************************************************************
1323 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1325 int numNs = seq.getAmbigBases();
1326 bool success = 0; //guilty until proven innocent
1328 if(numNs <= maxAmbig) { success = 1; }
1329 else { success = 0; }
1333 catch(exception& e) {
1334 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1340 //***************************************************************************************************************
1342 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1345 int length = oligo.length();
1347 for(int i=0;i<length;i++){
1349 if(oligo[i] != seq[i]){
1350 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
1351 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
1352 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
1353 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
1354 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
1355 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1356 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
1357 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1358 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1359 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1360 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
1361 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1363 if(success == 0) { break; }
1372 catch(exception& e) {
1373 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1379 //***************************************************************************************************************
1381 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1384 int length = oligo.length();
1387 for(int i=0;i<length;i++){
1389 if(oligo[i] != seq[i]){
1390 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
1391 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
1392 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
1393 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
1394 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
1395 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1396 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
1397 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1398 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1399 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1400 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
1401 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1408 catch(exception& e) {
1409 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1415 //***************************************************************************************************************