5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
13 //**********************************************************************************************************************
15 vector<string> TrimSeqsCommand::getValidParameters(){
17 string Array[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop","minlength", "maxlength", "qfile",
18 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
19 "keepfirst", "removelast",
20 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
21 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25 m->errorOut(e, "TrimSeqsCommand", "getValidParameters");
30 //**********************************************************************************************************************
32 TrimSeqsCommand::TrimSeqsCommand(){
34 abort = true; calledHelp = true;
35 vector<string> tempOutNames;
36 outputTypes["fasta"] = tempOutNames;
37 outputTypes["qfile"] = tempOutNames;
38 outputTypes["group"] = tempOutNames;
41 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
46 //**********************************************************************************************************************
48 vector<string> TrimSeqsCommand::getRequiredParameters(){
50 string Array[] = {"fasta"};
51 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
55 m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters");
60 //**********************************************************************************************************************
62 vector<string> TrimSeqsCommand::getRequiredFiles(){
64 vector<string> myArray;
68 m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles");
73 //***************************************************************************************************************
75 TrimSeqsCommand::TrimSeqsCommand(string option) {
78 abort = false; calledHelp = false;
81 //allow user to run help
82 if(option == "help") { help(); abort = true; calledHelp = true; }
85 //valid paramters for this command
86 string AlignArray[] = { "fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile",
87 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
88 "keepfirst", "removelast",
89 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
91 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
93 OptionParser parser(option);
94 map<string,string> parameters = parser.getParameters();
96 ValidParameters validParameter;
97 map<string,string>::iterator it;
99 //check to make sure all parameters are valid for command
100 for (it = parameters.begin(); it != parameters.end(); it++) {
101 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
104 //initialize outputTypes
105 vector<string> tempOutNames;
106 outputTypes["fasta"] = tempOutNames;
107 outputTypes["qfile"] = tempOutNames;
108 outputTypes["group"] = tempOutNames;
110 //if the user changes the input directory command factory will send this info to us in the output parameter
111 string inputDir = validParameter.validFile(parameters, "inputdir", false);
112 if (inputDir == "not found"){ inputDir = ""; }
115 it = parameters.find("fasta");
116 //user has given a template file
117 if(it != parameters.end()){
118 path = m->hasPath(it->second);
119 //if the user has not given a path then, add inputdir. else leave path alone.
120 if (path == "") { parameters["fasta"] = inputDir + it->second; }
123 it = parameters.find("oligos");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["oligos"] = inputDir + it->second; }
131 it = parameters.find("qfile");
132 //user has given a template file
133 if(it != parameters.end()){
134 path = m->hasPath(it->second);
135 //if the user has not given a path then, add inputdir. else leave path alone.
136 if (path == "") { parameters["qfile"] = inputDir + it->second; }
142 //check for required parameters
143 fastaFile = validParameter.validFile(parameters, "fasta", true);
144 if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; }
145 else if (fastaFile == "not open") { abort = true; }
147 //if the user changes the output directory command factory will send this info to us in the output parameter
148 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
150 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
154 //check for optional parameter and set defaults
155 // ...at some point should added some additional type checking...
157 temp = validParameter.validFile(parameters, "flip", false);
158 if (temp == "not found"){ flip = 0; }
159 else if(m->isTrue(temp)) { flip = 1; }
161 temp = validParameter.validFile(parameters, "oligos", true);
162 if (temp == "not found"){ oligoFile = ""; }
163 else if(temp == "not open"){ abort = true; }
164 else { oligoFile = temp; }
167 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
168 convert(temp, maxAmbig);
170 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
171 convert(temp, maxHomoP);
173 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
174 convert(temp, minLength);
176 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
177 convert(temp, maxLength);
179 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
180 convert(temp, bdiffs);
182 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
183 convert(temp, pdiffs);
185 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
186 convert(temp, tdiffs);
188 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
190 temp = validParameter.validFile(parameters, "qfile", true);
191 if (temp == "not found") { qFileName = ""; }
192 else if(temp == "not open") { abort = true; }
193 else { qFileName = temp; }
195 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
196 convert(temp, qThreshold);
198 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
199 qtrim = m->isTrue(temp);
201 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
202 convert(temp, qRollAverage);
204 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
205 convert(temp, qWindowAverage);
207 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
208 convert(temp, qWindowSize);
210 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
211 convert(temp, qWindowStep);
213 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
214 convert(temp, qAverage);
216 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
217 convert(temp, keepFirst);
219 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
220 convert(temp, removeLast);
222 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
223 allFiles = m->isTrue(temp);
225 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
226 convert(temp, processors);
229 if(allFiles && (oligoFile == "")){
230 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
232 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
233 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
237 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
238 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
244 catch(exception& e) {
245 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
250 //**********************************************************************************************************************
252 void TrimSeqsCommand::help(){
254 m->mothurOut("The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n");
255 m->mothurOut("The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n");
256 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n");
257 m->mothurOut("The fasta parameter is required.\n");
258 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
259 m->mothurOut("The oligos parameter allows you to provide an oligos file.\n");
260 m->mothurOut("The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n");
261 m->mothurOut("The maxhomop parameter allows you to set a maximum homopolymer length. \n");
262 m->mothurOut("The minlength parameter allows you to set and minimum sequence length. \n");
263 m->mothurOut("The maxlength parameter allows you to set and maximum sequence length. \n");
264 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
265 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
266 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
267 m->mothurOut("The qfile parameter allows you to provide a quality file.\n");
268 m->mothurOut("The qthreshold parameter allows you to set a minimum quality score allowed. \n");
269 m->mothurOut("The qaverage parameter allows you to set a minimum average quality score allowed. \n");
270 m->mothurOut("The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n");
271 m->mothurOut("The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n");
272 m->mothurOut("The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n");
273 m->mothurOut("The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n");
274 m->mothurOut("The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n");
275 m->mothurOut("The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n");
276 m->mothurOut("The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n");
277 m->mothurOut("The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n");
278 m->mothurOut("The trim.seqs command should be in the following format: \n");
279 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n");
280 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n");
281 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
282 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
283 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
286 catch(exception& e) {
287 m->errorOut(e, "TrimSeqsCommand", "help");
293 //***************************************************************************************************************
295 TrimSeqsCommand::~TrimSeqsCommand(){ /* do nothing */ }
297 //***************************************************************************************************************
299 int TrimSeqsCommand::execute(){
302 if (abort == true) { if (calledHelp) { return 0; } return 2; }
304 numFPrimers = 0; //this needs to be initialized
306 vector<vector<string> > fastaFileNames;
307 vector<vector<string> > qualFileNames;
309 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
310 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
312 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
313 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
315 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
316 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
317 if (qFileName != "") {
318 outputNames.push_back(trimQualFile);
319 outputNames.push_back(scrapQualFile);
320 outputTypes["qfile"].push_back(trimQualFile);
321 outputTypes["qfile"].push_back(scrapQualFile);
324 string outputGroupFileName;
326 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
327 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
328 getOligos(fastaFileNames, qualFileNames);
331 vector<unsigned long int> fastaFilePos;
332 vector<unsigned long int> qFilePos;
334 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
336 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
337 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
338 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
340 if(qFileName == "") { qLines = lines; } //files with duds
342 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
344 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
346 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames);
349 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
352 if (m->control_pressed) { return 0; }
355 //clear out all old group files
356 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
357 map<string, string>::iterator it;
358 set<string> namesToRemove;
359 for(int i=0;i<fastaFileNames.size();i++){
360 for(int j=0;j<fastaFileNames[0].size();j++){
361 if (fastaFileNames[i][j] != "") {
362 if(m->isBlank(fastaFileNames[i][j])){
363 remove(fastaFileNames[i][j].c_str());
364 namesToRemove.insert(fastaFileNames[i][j]);
367 remove(qualFileNames[i][j].c_str());
368 namesToRemove.insert(qualFileNames[i][j]);
371 it = uniqueFastaNames.find(fastaFileNames[i][j]);
372 if (it == uniqueFastaNames.end()) {
373 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
380 //remove names for outputFileNames, just cleans up the output
381 vector<string> outputNames2;
382 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
383 outputNames = outputNames2;
385 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
387 m->openInputFile(it->first, in);
390 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
391 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
392 m->openOutputFile(thisGroupName, out);
395 if (m->control_pressed) { break; }
397 Sequence currSeq(in); m->gobble(in);
398 out << currSeq.getName() << '\t' << it->second << endl;
405 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
407 //output group counts
408 m->mothurOutEndLine();
410 for (int i = 0; i < barcodeNameVector.size(); i++) {
411 if ((barcodeNameVector[i] != "") && (groupCounts[i] != 0)) { total += groupCounts[i]; m->mothurOut("Group " + barcodeNameVector[i] + " contains " + toString(groupCounts[i]) + " sequences."); m->mothurOutEndLine(); }
413 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
415 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
417 m->mothurOutEndLine();
418 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
419 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
420 m->mothurOutEndLine();
425 catch(exception& e) {
426 m->errorOut(e, "TrimSeqsCommand", "execute");
431 /**************************************************************************************/
433 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {
437 ofstream trimFASTAFile;
438 m->openOutputFile(trimFileName, trimFASTAFile);
440 ofstream scrapFASTAFile;
441 m->openOutputFile(scrapFileName, scrapFASTAFile);
443 ofstream trimQualFile;
444 ofstream scrapQualFile;
446 m->openOutputFile(trimQFileName, trimQualFile);
447 m->openOutputFile(scrapQFileName, scrapQualFile);
450 ofstream outGroupsFile;
451 if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); }
453 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
454 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
455 if (fastaFileNames[i][j] != "") {
457 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
459 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
467 m->openInputFile(filename, inFASTA);
468 inFASTA.seekg(line->start);
471 if(qFileName != "") {
472 m->openInputFile(qFileName, qFile);
473 qFile.seekg(qline->start);
481 if (m->control_pressed) {
482 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
483 if (oligoFile != "") { outGroupsFile.close(); }
488 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
494 string trashCode = "";
495 int currentSeqsDiffs = 0;
497 Sequence currSeq(inFASTA); m->gobble(inFASTA);
499 QualityScores currQual;
501 currQual = QualityScores(qFile); m->gobble(qFile);
504 string origSeq = currSeq.getUnaligned();
507 int barcodeIndex = 0;
510 if(barcodes.size() != 0){
511 success = stripBarcode(currSeq, currQual, barcodeIndex);
512 if(success > bdiffs) { trashCode += 'b'; }
513 else{ currentSeqsDiffs += success; }
516 if(numFPrimers != 0){
517 success = stripForward(currSeq, currQual, primerIndex);
518 if(success > pdiffs) { trashCode += 'f'; }
519 else{ currentSeqsDiffs += success; }
522 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
524 if(numRPrimers != 0){
525 success = stripReverse(currSeq, currQual);
526 if(!success) { trashCode += 'r'; }
530 success = keepFirstTrim(currSeq, currQual);
534 success = removeLastTrim(currSeq, currQual);
535 if(!success) { trashCode += 'l'; }
540 int origLength = currSeq.getNumBases();
542 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
543 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
544 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
545 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
546 else { success = 1; }
548 //you don't want to trim, if it fails above then scrap it
549 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
551 if(!success) { trashCode += 'q'; }
554 if(minLength > 0 || maxLength > 0){
555 success = cullLength(currSeq);
556 if(!success) { trashCode += 'l'; }
559 success = cullHomoP(currSeq);
560 if(!success) { trashCode += 'h'; }
563 success = cullAmbigs(currSeq);
564 if(!success) { trashCode += 'n'; }
567 if(flip){ // should go last
568 currSeq.reverseComplement();
570 currQual.flipQScores();
574 if(trashCode.length() == 0){
575 currSeq.setAligned(currSeq.getUnaligned());
576 currSeq.printSequence(trimFASTAFile);
579 currQual.printQScores(trimQualFile);
582 if(barcodes.size() != 0){
583 outGroupsFile << currSeq.getName() << '\t' << barcodeNameVector[barcodeIndex] << endl;
584 groupCounts[barcodeIndex]++;
590 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
591 currSeq.printSequence(output);
595 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
596 currQual.printQScores(output);
602 currSeq.setName(currSeq.getName() + '|' + trashCode);
603 currSeq.setUnaligned(origSeq);
604 currSeq.setAligned(origSeq);
605 currSeq.printSequence(scrapFASTAFile);
607 currQual.printQScores(scrapQualFile);
613 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
614 unsigned long int pos = inFASTA.tellg();
615 if ((pos == -1) || (pos >= line->end)) { break; }
617 if (inFASTA.eof()) { break; }
621 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
625 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
629 trimFASTAFile.close();
630 scrapFASTAFile.close();
631 if (oligoFile != "") { outGroupsFile.close(); }
632 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
636 catch(exception& e) {
637 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
642 /**************************************************************************************************/
644 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
646 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
651 //loop through and create all the processes you want
652 while (process != processors) {
656 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
660 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
661 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
666 for(int i=0;i<tempFASTAFileNames.size();i++){
667 for(int j=0;j<tempFASTAFileNames[i].size();j++){
668 if (tempFASTAFileNames[i][j] != "") {
669 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
670 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
673 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
674 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
681 driverCreateTrim(filename,
683 (trimFASTAFileName + toString(getpid()) + ".temp"),
684 (scrapFASTAFileName + toString(getpid()) + ".temp"),
685 (trimQualFileName + toString(getpid()) + ".temp"),
686 (scrapQualFileName + toString(getpid()) + ".temp"),
687 (groupFile + toString(getpid()) + ".temp"),
689 tempPrimerQualFileNames,
693 //pass groupCounts to parent
695 string tempFile = filename + toString(getpid()) + ".num.temp";
696 m->openOutputFile(tempFile, out);
697 for(int i = 0; i < groupCounts.size(); i++) {
698 out << groupCounts[i] << endl;
704 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
705 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
712 m->openOutputFile(trimFASTAFileName, temp); temp.close();
713 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
714 m->openOutputFile(trimQualFileName, temp); temp.close();
715 m->openOutputFile(scrapQualFileName, temp); temp.close();
717 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
719 //force parent to wait until all the processes are done
720 for (int i=0;i<processIDS.size();i++) {
721 int temp = processIDS[i];
726 for(int i=0;i<processIDS.size();i++){
728 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
730 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
731 remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
732 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
733 remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
736 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
737 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
738 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
739 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
742 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
743 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
747 for(int j=0;j<fastaFileNames.size();j++){
748 for(int k=0;k<fastaFileNames[j].size();k++){
749 if (fastaFileNames[j][k] != "") {
750 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
751 remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
754 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
755 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
763 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
764 m->openInputFile(tempFile, in);
768 in >> tempNum; m->gobble(in);
769 groupCounts[count] += tempNum;
772 in.close(); remove(tempFile.c_str());
779 catch(exception& e) {
780 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
785 /**************************************************************************************************/
787 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
790 //set file positions for fasta file
791 fastaFilePos = m->divideFile(filename, processors);
793 if (qfilename == "") { return processors; }
795 //get name of first sequence in each chunk
796 map<string, int> firstSeqNames;
797 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
799 m->openInputFile(filename, in);
800 in.seekg(fastaFilePos[i]);
803 firstSeqNames[temp.getName()] = i;
808 //seach for filePos of each first name in the qfile and save in qfileFilePos
810 m->openInputFile(qfilename, inQual);
813 while(!inQual.eof()){
814 input = m->getline(inQual);
816 if (input.length() != 0) {
817 if(input[0] == '>'){ //this is a sequence name line
818 istringstream nameStream(input);
820 string sname = ""; nameStream >> sname;
821 sname = sname.substr(1);
823 map<string, int>::iterator it = firstSeqNames.find(sname);
825 if(it != firstSeqNames.end()) { //this is the start of a new chunk
826 unsigned long int pos = inQual.tellg();
827 qfileFilePos.push_back(pos - input.length() - 1);
828 firstSeqNames.erase(it);
833 if (firstSeqNames.size() == 0) { break; }
838 if (firstSeqNames.size() != 0) {
839 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
840 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
846 //get last file position of qfile
848 unsigned long int size;
850 //get num bytes in file
851 pFile = fopen (qfilename.c_str(),"rb");
852 if (pFile==NULL) perror ("Error opening file");
854 fseek (pFile, 0, SEEK_END);
859 qfileFilePos.push_back(size);
863 catch(exception& e) {
864 m->errorOut(e, "TrimSeqsCommand", "setLines");
869 //***************************************************************************************************************
871 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
874 m->openInputFile(oligoFile, inOligos);
878 string type, oligo, group;
881 int indexBarcode = 0;
883 while(!inOligos.eof()){
885 inOligos >> type; m->gobble(inOligos);
888 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
891 //make type case insensitive
892 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
896 for(int i=0;i<oligo.length();i++){
897 oligo[i] = toupper(oligo[i]);
898 if(oligo[i] == 'U') { oligo[i] = 'T'; }
901 if(type == "FORWARD"){
904 // get rest of line in case there is a primer name
905 while (!inOligos.eof()) {
906 char c = inOligos.get();
907 if (c == 10 || c == 13){ break; }
908 else if (c == 32 || c == 9){;} //space or tab
912 //check for repeat barcodes
913 map<string, int>::iterator itPrime = primers.find(oligo);
914 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
916 primers[oligo]=indexPrimer; indexPrimer++;
917 primerNameVector.push_back(group);
919 else if(type == "REVERSE"){
920 Sequence oligoRC("reverse", oligo);
921 oligoRC.reverseComplement();
922 revPrimer.push_back(oligoRC.getUnaligned());
924 else if(type == "BARCODE"){
927 //check for repeat barcodes
928 map<string, int>::iterator itBar = barcodes.find(oligo);
929 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
931 barcodes[oligo]=indexBarcode; indexBarcode++;
932 barcodeNameVector.push_back(group);
934 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
940 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
942 //add in potential combos
943 if(barcodeNameVector.size() == 0){
945 barcodeNameVector.push_back("");
948 if(primerNameVector.size() == 0){
950 primerNameVector.push_back("");
953 fastaFileNames.resize(barcodeNameVector.size());
954 for(int i=0;i<fastaFileNames.size();i++){
955 fastaFileNames[i].assign(primerNameVector.size(), "");
957 if(qFileName != ""){ qualFileNames = fastaFileNames; }
960 set<string> uniqueNames; //used to cleanup outputFileNames
961 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
962 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
964 string primerName = primerNameVector[itPrimer->second];
965 string barcodeName = barcodeNameVector[itBar->second];
967 string comboGroupName = "";
968 string fastaFileName = "";
969 string qualFileName = "";
971 if(primerName == ""){
972 comboGroupName = barcodeNameVector[itBar->second];
975 if(barcodeName == ""){
976 comboGroupName = primerNameVector[itPrimer->second];
979 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
984 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
985 if (uniqueNames.count(fastaFileName) == 0) {
986 outputNames.push_back(fastaFileName);
987 outputTypes["fasta"].push_back(fastaFileName);
988 uniqueNames.insert(fastaFileName);
991 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
992 m->openOutputFile(fastaFileName, temp); temp.close();
995 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
996 if (uniqueNames.count(fastaFileName) == 0) {
997 outputNames.push_back(qualFileName);
998 outputTypes["qfile"].push_back(qualFileName);
1001 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1002 m->openOutputFile(qualFileName, temp); temp.close();
1007 numFPrimers = primers.size();
1008 numRPrimers = revPrimer.size();
1009 groupCounts.resize(barcodeNameVector.size(), 0);
1012 catch(exception& e) {
1013 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1018 //***************************************************************************************************************
1020 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1023 string rawSequence = seq.getUnaligned();
1024 int success = bdiffs + 1; //guilty until proven innocent
1026 //can you find the barcode
1027 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1028 string oligo = it->first;
1029 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
1030 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
1034 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1036 seq.setUnaligned(rawSequence.substr(oligo.length()));
1038 if(qual.getName() != ""){
1039 qual.trimQScores(oligo.length(), -1);
1047 //if you found the barcode or if you don't want to allow for diffs
1048 if ((bdiffs == 0) || (success == 0)) { return success; }
1050 else { //try aligning and see if you can find it
1054 Alignment* alignment;
1055 if (barcodes.size() > 0) {
1056 map<string,int>::iterator it=barcodes.begin();
1058 for(it;it!=barcodes.end();it++){
1059 if(it->first.length() > maxLength){
1060 maxLength = it->first.length();
1063 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
1065 }else{ alignment = NULL; }
1067 //can you find the barcode
1073 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1074 string oligo = it->first;
1075 // int length = oligo.length();
1077 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
1078 success = bdiffs + 10;
1082 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1083 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1084 oligo = alignment->getSeqAAln();
1085 string temp = alignment->getSeqBAln();
1087 int alnLength = oligo.length();
1089 for(int i=oligo.length()-1;i>=0;i--){
1090 if(oligo[i] != '-'){ alnLength = i+1; break; }
1092 oligo = oligo.substr(0,alnLength);
1093 temp = temp.substr(0,alnLength);
1095 int numDiff = countDiffs(oligo, temp);
1097 if(numDiff < minDiff){
1100 minGroup = it->second;
1102 for(int i=0;i<alnLength;i++){
1108 else if(numDiff == minDiff){
1114 if(minDiff > bdiffs) { success = minDiff; } //no good matches
1115 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
1116 else{ //use the best match
1118 seq.setUnaligned(rawSequence.substr(minPos));
1120 if(qual.getName() != ""){
1121 qual.trimQScores(minPos, -1);
1126 if (alignment != NULL) { delete alignment; }
1133 catch(exception& e) {
1134 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1140 //***************************************************************************************************************
1142 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1144 string rawSequence = seq.getUnaligned();
1145 int success = pdiffs + 1; //guilty until proven innocent
1147 //can you find the primer
1148 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1149 string oligo = it->first;
1150 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
1151 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1155 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1157 seq.setUnaligned(rawSequence.substr(oligo.length()));
1158 if(qual.getName() != ""){
1159 qual.trimQScores(oligo.length(), -1);
1166 //if you found the barcode or if you don't want to allow for diffs
1167 if ((pdiffs == 0) || (success == 0)) { return success; }
1169 else { //try aligning and see if you can find it
1173 Alignment* alignment;
1174 if (primers.size() > 0) {
1175 map<string,int>::iterator it=primers.begin();
1177 for(it;it!=primers.end();it++){
1178 if(it->first.length() > maxLength){
1179 maxLength = it->first.length();
1182 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
1184 }else{ alignment = NULL; }
1186 //can you find the barcode
1192 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1193 string oligo = it->first;
1194 // int length = oligo.length();
1196 if(rawSequence.length() < maxLength){
1197 success = pdiffs + 100;
1201 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1202 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1203 oligo = alignment->getSeqAAln();
1204 string temp = alignment->getSeqBAln();
1206 int alnLength = oligo.length();
1208 for(int i=oligo.length()-1;i>=0;i--){
1209 if(oligo[i] != '-'){ alnLength = i+1; break; }
1211 oligo = oligo.substr(0,alnLength);
1212 temp = temp.substr(0,alnLength);
1214 int numDiff = countDiffs(oligo, temp);
1216 if(numDiff < minDiff){
1219 minGroup = it->second;
1221 for(int i=0;i<alnLength;i++){
1227 else if(numDiff == minDiff){
1233 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1234 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
1235 else{ //use the best match
1237 seq.setUnaligned(rawSequence.substr(minPos));
1238 if(qual.getName() != ""){
1239 qual.trimQScores(minPos, -1);
1244 if (alignment != NULL) { delete alignment; }
1251 catch(exception& e) {
1252 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1257 //***************************************************************************************************************
1259 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1261 string rawSequence = seq.getUnaligned();
1262 bool success = 0; //guilty until proven innocent
1264 for(int i=0;i<numRPrimers;i++){
1265 string oligo = revPrimer[i];
1267 if(rawSequence.length() < oligo.length()){
1272 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1273 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1274 if(qual.getName() != ""){
1275 qual.trimQScores(-1, rawSequence.length()-oligo.length());
1284 catch(exception& e) {
1285 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1290 //***************************************************************************************************************
1292 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1295 if(qscores.getName() != ""){
1296 qscores.trimQScores(-1, keepFirst);
1298 sequence.trim(keepFirst);
1301 catch(exception& e) {
1302 m->errorOut(e, "keepFirstTrim", "countDiffs");
1308 //***************************************************************************************************************
1310 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1314 int length = sequence.getNumBases() - removeLast;
1317 if(qscores.getName() != ""){
1318 qscores.trimQScores(-1, length);
1320 sequence.trim(length);
1329 catch(exception& e) {
1330 m->errorOut(e, "removeLastTrim", "countDiffs");
1336 //***************************************************************************************************************
1338 bool TrimSeqsCommand::cullLength(Sequence& seq){
1341 int length = seq.getNumBases();
1342 bool success = 0; //guilty until proven innocent
1344 if(length >= minLength && maxLength == 0) { success = 1; }
1345 else if(length >= minLength && length <= maxLength) { success = 1; }
1346 else { success = 0; }
1351 catch(exception& e) {
1352 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1358 //***************************************************************************************************************
1360 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1362 int longHomoP = seq.getLongHomoPolymer();
1363 bool success = 0; //guilty until proven innocent
1365 if(longHomoP <= maxHomoP){ success = 1; }
1366 else { success = 0; }
1370 catch(exception& e) {
1371 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1377 //***************************************************************************************************************
1379 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1381 int numNs = seq.getAmbigBases();
1382 bool success = 0; //guilty until proven innocent
1384 if(numNs <= maxAmbig) { success = 1; }
1385 else { success = 0; }
1389 catch(exception& e) {
1390 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1396 //***************************************************************************************************************
1398 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1401 int length = oligo.length();
1403 for(int i=0;i<length;i++){
1405 if(oligo[i] != seq[i]){
1406 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
1407 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
1408 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
1409 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
1410 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
1411 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1412 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
1413 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1414 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1415 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1416 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
1417 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1419 if(success == 0) { break; }
1428 catch(exception& e) {
1429 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1435 //***************************************************************************************************************
1437 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1440 int length = oligo.length();
1443 for(int i=0;i<length;i++){
1445 if(oligo[i] != seq[i]){
1446 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
1447 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
1448 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
1449 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
1450 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
1451 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1452 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
1453 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1454 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1455 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1456 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
1457 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1464 catch(exception& e) {
1465 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1471 //***************************************************************************************************************