5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
13 //**********************************************************************************************************************
15 vector<string> TrimSeqsCommand::getValidParameters(){
17 string Array[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop","minlength", "maxlength", "qfile",
18 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
19 "keepfirst", "removelast",
20 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
21 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25 m->errorOut(e, "TrimSeqsCommand", "getValidParameters");
30 //**********************************************************************************************************************
32 TrimSeqsCommand::TrimSeqsCommand(){
34 abort = true; calledHelp = true;
35 vector<string> tempOutNames;
36 outputTypes["fasta"] = tempOutNames;
37 outputTypes["qfile"] = tempOutNames;
38 outputTypes["group"] = tempOutNames;
41 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
46 //**********************************************************************************************************************
48 vector<string> TrimSeqsCommand::getRequiredParameters(){
50 string Array[] = {"fasta"};
51 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
55 m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters");
60 //**********************************************************************************************************************
62 vector<string> TrimSeqsCommand::getRequiredFiles(){
64 vector<string> myArray;
68 m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles");
73 //***************************************************************************************************************
75 TrimSeqsCommand::TrimSeqsCommand(string option) {
78 abort = false; calledHelp = false;
81 //allow user to run help
82 if(option == "help") { help(); abort = true; calledHelp = true; }
85 //valid paramters for this command
86 string AlignArray[] = { "fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile",
87 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
88 "keepfirst", "removelast",
89 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
91 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
93 OptionParser parser(option);
94 map<string,string> parameters = parser.getParameters();
96 ValidParameters validParameter;
97 map<string,string>::iterator it;
99 //check to make sure all parameters are valid for command
100 for (it = parameters.begin(); it != parameters.end(); it++) {
101 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
104 //initialize outputTypes
105 vector<string> tempOutNames;
106 outputTypes["fasta"] = tempOutNames;
107 outputTypes["qfile"] = tempOutNames;
108 outputTypes["group"] = tempOutNames;
110 //if the user changes the input directory command factory will send this info to us in the output parameter
111 string inputDir = validParameter.validFile(parameters, "inputdir", false);
112 if (inputDir == "not found"){ inputDir = ""; }
115 it = parameters.find("fasta");
116 //user has given a template file
117 if(it != parameters.end()){
118 path = m->hasPath(it->second);
119 //if the user has not given a path then, add inputdir. else leave path alone.
120 if (path == "") { parameters["fasta"] = inputDir + it->second; }
123 it = parameters.find("oligos");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["oligos"] = inputDir + it->second; }
131 it = parameters.find("qfile");
132 //user has given a template file
133 if(it != parameters.end()){
134 path = m->hasPath(it->second);
135 //if the user has not given a path then, add inputdir. else leave path alone.
136 if (path == "") { parameters["qfile"] = inputDir + it->second; }
142 //check for required parameters
143 fastaFile = validParameter.validFile(parameters, "fasta", true);
144 if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; }
145 else if (fastaFile == "not open") { abort = true; }
147 //if the user changes the output directory command factory will send this info to us in the output parameter
148 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
150 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
154 //check for optional parameter and set defaults
155 // ...at some point should added some additional type checking...
157 temp = validParameter.validFile(parameters, "flip", false);
158 if (temp == "not found"){ flip = 0; }
159 else if(m->isTrue(temp)) { flip = 1; }
161 temp = validParameter.validFile(parameters, "oligos", true);
162 if (temp == "not found"){ oligoFile = ""; }
163 else if(temp == "not open"){ abort = true; }
164 else { oligoFile = temp; }
167 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
168 convert(temp, maxAmbig);
170 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
171 convert(temp, maxHomoP);
173 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
174 convert(temp, minLength);
176 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
177 convert(temp, maxLength);
179 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
180 convert(temp, bdiffs);
182 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
183 convert(temp, pdiffs);
185 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
186 convert(temp, tdiffs);
188 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
190 temp = validParameter.validFile(parameters, "qfile", true);
191 if (temp == "not found") { qFileName = ""; }
192 else if(temp == "not open") { abort = true; }
193 else { qFileName = temp; }
195 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
196 convert(temp, qThreshold);
198 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
199 qtrim = m->isTrue(temp);
201 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
202 convert(temp, qRollAverage);
204 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
205 convert(temp, qWindowAverage);
207 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
208 convert(temp, qWindowSize);
210 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
211 convert(temp, qWindowStep);
213 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
214 convert(temp, qAverage);
216 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
217 convert(temp, keepFirst);
219 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
220 convert(temp, removeLast);
222 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
223 allFiles = m->isTrue(temp);
225 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
226 convert(temp, processors);
229 if(allFiles && (oligoFile == "")){
230 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
232 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
233 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
237 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
238 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
244 catch(exception& e) {
245 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
250 //**********************************************************************************************************************
252 void TrimSeqsCommand::help(){
254 m->mothurOut("The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n");
255 m->mothurOut("The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n");
256 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n");
257 m->mothurOut("The fasta parameter is required.\n");
258 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
259 m->mothurOut("The oligos parameter allows you to provide an oligos file.\n");
260 m->mothurOut("The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n");
261 m->mothurOut("The maxhomop parameter allows you to set a maximum homopolymer length. \n");
262 m->mothurOut("The minlength parameter allows you to set and minimum sequence length. \n");
263 m->mothurOut("The maxlength parameter allows you to set and maximum sequence length. \n");
264 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
265 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
266 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
267 m->mothurOut("The qfile parameter allows you to provide a quality file.\n");
268 m->mothurOut("The qthreshold parameter allows you to set a minimum quality score allowed. \n");
269 m->mothurOut("The qaverage parameter allows you to set a minimum average quality score allowed. \n");
270 m->mothurOut("The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n");
271 m->mothurOut("The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n");
272 m->mothurOut("The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n");
273 m->mothurOut("The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n");
274 m->mothurOut("The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n");
275 m->mothurOut("The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n");
276 m->mothurOut("The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n");
277 m->mothurOut("The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n");
278 m->mothurOut("The trim.seqs command should be in the following format: \n");
279 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n");
280 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n");
281 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
282 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
283 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
286 catch(exception& e) {
287 m->errorOut(e, "TrimSeqsCommand", "help");
293 //***************************************************************************************************************
295 TrimSeqsCommand::~TrimSeqsCommand(){ /* do nothing */ }
297 //***************************************************************************************************************
299 int TrimSeqsCommand::execute(){
302 if (abort == true) { if (calledHelp) { return 0; } return 2; }
304 numFPrimers = 0; //this needs to be initialized
306 vector<vector<string> > fastaFileNames;
307 vector<vector<string> > qualFileNames;
309 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
310 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
312 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
313 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
315 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
316 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
317 if (qFileName != "") {
318 outputNames.push_back(trimQualFile);
319 outputNames.push_back(scrapQualFile);
320 outputTypes["qfile"].push_back(trimQualFile);
321 outputTypes["qfile"].push_back(scrapQualFile);
324 string outputGroupFileName;
326 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
327 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
328 getOligos(fastaFileNames, qualFileNames);
331 vector<unsigned long int> fastaFilePos;
332 vector<unsigned long int> qFilePos;
334 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
336 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
337 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
338 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
340 if(qFileName == "") { qLines = lines; } //files with duds
342 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
344 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
346 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames);
349 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
352 if (m->control_pressed) { return 0; }
355 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
356 map<string, string>::iterator it;
357 set<string> namesToRemove;
358 for(int i=0;i<fastaFileNames.size();i++){
359 for(int j=0;j<fastaFileNames[0].size();j++){
360 if (fastaFileNames[i][j] != "") {
361 if(m->isBlank(fastaFileNames[i][j])){
362 remove(fastaFileNames[i][j].c_str());
363 namesToRemove.insert(fastaFileNames[i][j]);
366 remove(qualFileNames[i][j].c_str());
367 namesToRemove.insert(qualFileNames[i][j]);
370 it = uniqueFastaNames.find(fastaFileNames[i][j]);
371 if (it == uniqueFastaNames.end()) {
372 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
379 //remove names for outputFileNames, just cleans up the output
380 vector<string> outputNames2;
381 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
382 outputNames = outputNames2;
384 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
386 m->openInputFile(it->first, in);
389 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
390 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
391 m->openOutputFile(thisGroupName, out);
394 if (m->control_pressed) { break; }
396 Sequence currSeq(in); m->gobble(in);
397 out << currSeq.getName() << '\t' << it->second << endl;
404 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
406 //output group counts
407 m->mothurOutEndLine();
409 for (int i = 0; i < barcodeNameVector.size(); i++) {
410 if ((barcodeNameVector[i] != "") && (groupCounts[i] != 0)) { total += groupCounts[i]; m->mothurOut("Group " + barcodeNameVector[i] + " contains " + toString(groupCounts[i]) + " sequences."); m->mothurOutEndLine(); }
412 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
414 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
416 m->mothurOutEndLine();
417 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
418 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
419 m->mothurOutEndLine();
424 catch(exception& e) {
425 m->errorOut(e, "TrimSeqsCommand", "execute");
430 /**************************************************************************************/
432 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {
436 ofstream trimFASTAFile;
437 m->openOutputFile(trimFileName, trimFASTAFile);
439 ofstream scrapFASTAFile;
440 m->openOutputFile(scrapFileName, scrapFASTAFile);
442 ofstream trimQualFile;
443 ofstream scrapQualFile;
445 m->openOutputFile(trimQFileName, trimQualFile);
446 m->openOutputFile(scrapQFileName, scrapQualFile);
449 ofstream outGroupsFile;
450 if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); }
452 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
453 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
454 if (fastaFileNames[i][j] != "") {
456 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
458 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
466 m->openInputFile(filename, inFASTA);
467 inFASTA.seekg(line->start);
470 if(qFileName != "") {
471 m->openInputFile(qFileName, qFile);
472 qFile.seekg(qline->start);
480 if (m->control_pressed) {
481 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
482 if (oligoFile != "") { outGroupsFile.close(); }
487 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
493 string trashCode = "";
494 int currentSeqsDiffs = 0;
496 Sequence currSeq(inFASTA); m->gobble(inFASTA);
498 QualityScores currQual;
500 currQual = QualityScores(qFile); m->gobble(qFile);
503 string origSeq = currSeq.getUnaligned();
506 int barcodeIndex = 0;
509 if(barcodes.size() != 0){
510 success = stripBarcode(currSeq, currQual, barcodeIndex);
511 if(success > bdiffs) { trashCode += 'b'; }
512 else{ currentSeqsDiffs += success; }
515 if(numFPrimers != 0){
516 success = stripForward(currSeq, currQual, primerIndex);
517 if(success > pdiffs) { trashCode += 'f'; }
518 else{ currentSeqsDiffs += success; }
521 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
523 if(numRPrimers != 0){
524 success = stripReverse(currSeq, currQual);
525 if(!success) { trashCode += 'r'; }
529 success = keepFirstTrim(currSeq, currQual);
533 success = removeLastTrim(currSeq, currQual);
534 if(!success) { trashCode += 'l'; }
539 int origLength = currSeq.getNumBases();
541 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
542 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
543 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
544 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
545 else { success = 1; }
547 //you don't want to trim, if it fails above then scrap it
548 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
550 if(!success) { trashCode += 'q'; }
553 if(minLength > 0 || maxLength > 0){
554 success = cullLength(currSeq);
555 if(!success) { trashCode += 'l'; }
558 success = cullHomoP(currSeq);
559 if(!success) { trashCode += 'h'; }
562 success = cullAmbigs(currSeq);
563 if(!success) { trashCode += 'n'; }
566 if(flip){ // should go last
567 currSeq.reverseComplement();
569 currQual.flipQScores();
573 if(trashCode.length() == 0){
574 currSeq.setAligned(currSeq.getUnaligned());
575 currSeq.printSequence(trimFASTAFile);
578 currQual.printQScores(trimQualFile);
581 if(barcodes.size() != 0){
582 outGroupsFile << currSeq.getName() << '\t' << barcodeNameVector[barcodeIndex] << endl;
583 groupCounts[barcodeIndex]++;
589 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
590 currSeq.printSequence(output);
594 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
595 currQual.printQScores(output);
601 currSeq.setName(currSeq.getName() + '|' + trashCode);
602 currSeq.setUnaligned(origSeq);
603 currSeq.setAligned(origSeq);
604 currSeq.printSequence(scrapFASTAFile);
606 currQual.printQScores(scrapQualFile);
612 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
613 unsigned long int pos = inFASTA.tellg();
614 if ((pos == -1) || (pos >= line->end)) { break; }
616 if (inFASTA.eof()) { break; }
620 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
624 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
628 trimFASTAFile.close();
629 scrapFASTAFile.close();
630 if (oligoFile != "") { outGroupsFile.close(); }
631 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
635 catch(exception& e) {
636 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
641 /**************************************************************************************************/
643 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
645 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
650 //loop through and create all the processes you want
651 while (process != processors) {
655 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
659 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
660 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
665 for(int i=0;i<tempFASTAFileNames.size();i++){
666 for(int j=0;j<tempFASTAFileNames[i].size();j++){
667 if (tempFASTAFileNames[i][j] != "") {
668 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
669 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
672 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
673 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
680 driverCreateTrim(filename,
682 (trimFASTAFileName + toString(getpid()) + ".temp"),
683 (scrapFASTAFileName + toString(getpid()) + ".temp"),
684 (trimQualFileName + toString(getpid()) + ".temp"),
685 (scrapQualFileName + toString(getpid()) + ".temp"),
686 (groupFile + toString(getpid()) + ".temp"),
688 tempPrimerQualFileNames,
692 //pass groupCounts to parent
694 string tempFile = filename + toString(getpid()) + ".num.temp";
695 m->openOutputFile(tempFile, out);
696 for(int i = 0; i < groupCounts.size(); i++) {
697 out << groupCounts[i] << endl;
703 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
704 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
711 m->openOutputFile(trimFASTAFileName, temp); temp.close();
712 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
713 m->openOutputFile(trimQualFileName, temp); temp.close();
714 m->openOutputFile(scrapQualFileName, temp); temp.close();
716 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
718 //force parent to wait until all the processes are done
719 for (int i=0;i<processIDS.size();i++) {
720 int temp = processIDS[i];
725 for(int i=0;i<processIDS.size();i++){
727 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
729 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
730 remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
731 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
732 remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
735 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
736 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
737 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
738 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
741 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
742 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
746 for(int j=0;j<fastaFileNames.size();j++){
747 for(int k=0;k<fastaFileNames[j].size();k++){
748 if (fastaFileNames[j][k] != "") {
749 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
750 remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
753 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
754 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
762 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
763 m->openInputFile(tempFile, in);
767 in >> tempNum; m->gobble(in);
768 groupCounts[count] += tempNum;
771 in.close(); remove(tempFile.c_str());
778 catch(exception& e) {
779 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
784 /**************************************************************************************************/
786 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
789 //set file positions for fasta file
790 fastaFilePos = m->divideFile(filename, processors);
792 if (qfilename == "") { return processors; }
794 //get name of first sequence in each chunk
795 map<string, int> firstSeqNames;
796 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
798 m->openInputFile(filename, in);
799 in.seekg(fastaFilePos[i]);
802 firstSeqNames[temp.getName()] = i;
807 //seach for filePos of each first name in the qfile and save in qfileFilePos
809 m->openInputFile(qfilename, inQual);
812 while(!inQual.eof()){
813 input = m->getline(inQual);
815 if (input.length() != 0) {
816 if(input[0] == '>'){ //this is a sequence name line
817 istringstream nameStream(input);
819 string sname = ""; nameStream >> sname;
820 sname = sname.substr(1);
822 map<string, int>::iterator it = firstSeqNames.find(sname);
824 if(it != firstSeqNames.end()) { //this is the start of a new chunk
825 unsigned long int pos = inQual.tellg();
826 qfileFilePos.push_back(pos - input.length() - 1);
827 firstSeqNames.erase(it);
832 if (firstSeqNames.size() == 0) { break; }
837 if (firstSeqNames.size() != 0) {
838 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
839 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
845 //get last file position of qfile
847 unsigned long int size;
849 //get num bytes in file
850 pFile = fopen (qfilename.c_str(),"rb");
851 if (pFile==NULL) perror ("Error opening file");
853 fseek (pFile, 0, SEEK_END);
858 qfileFilePos.push_back(size);
862 catch(exception& e) {
863 m->errorOut(e, "TrimSeqsCommand", "setLines");
868 //***************************************************************************************************************
870 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
873 m->openInputFile(oligoFile, inOligos);
877 string type, oligo, group;
880 int indexBarcode = 0;
882 while(!inOligos.eof()){
884 inOligos >> type; m->gobble(inOligos);
887 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
890 //make type case insensitive
891 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
895 for(int i=0;i<oligo.length();i++){
896 oligo[i] = toupper(oligo[i]);
897 if(oligo[i] == 'U') { oligo[i] = 'T'; }
900 if(type == "FORWARD"){
903 // get rest of line in case there is a primer name
904 while (!inOligos.eof()) {
905 char c = inOligos.get();
906 if (c == 10 || c == 13){ break; }
907 else if (c == 32 || c == 9){;} //space or tab
911 //check for repeat barcodes
912 map<string, int>::iterator itPrime = primers.find(oligo);
913 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
915 primers[oligo]=indexPrimer; indexPrimer++;
916 primerNameVector.push_back(group);
918 else if(type == "REVERSE"){
919 Sequence oligoRC("reverse", oligo);
920 oligoRC.reverseComplement();
921 revPrimer.push_back(oligoRC.getUnaligned());
923 else if(type == "BARCODE"){
926 //check for repeat barcodes
927 map<string, int>::iterator itBar = barcodes.find(oligo);
928 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
930 barcodes[oligo]=indexBarcode; indexBarcode++;
931 barcodeNameVector.push_back(group);
933 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
939 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
941 //add in potential combos
942 if(barcodeNameVector.size() == 0){
944 barcodeNameVector.push_back("");
947 if(primerNameVector.size() == 0){
949 primerNameVector.push_back("");
952 fastaFileNames.resize(barcodeNameVector.size());
953 for(int i=0;i<fastaFileNames.size();i++){
954 fastaFileNames[i].assign(primerNameVector.size(), "");
956 if(qFileName != ""){ qualFileNames = fastaFileNames; }
959 set<string> uniqueNames; //used to cleanup outputFileNames
960 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
961 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
963 string primerName = primerNameVector[itPrimer->second];
964 string barcodeName = barcodeNameVector[itBar->second];
966 string comboGroupName = "";
967 string fastaFileName = "";
968 string qualFileName = "";
970 if(primerName == ""){
971 comboGroupName = barcodeNameVector[itBar->second];
974 if(barcodeName == ""){
975 comboGroupName = primerNameVector[itPrimer->second];
978 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
983 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
984 if (uniqueNames.count(fastaFileName) == 0) {
985 outputNames.push_back(fastaFileName);
986 outputTypes["fasta"].push_back(fastaFileName);
987 uniqueNames.insert(fastaFileName);
990 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
991 m->openOutputFile(fastaFileName, temp); temp.close();
994 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
995 if (uniqueNames.count(fastaFileName) == 0) {
996 outputNames.push_back(qualFileName);
997 outputTypes["qfile"].push_back(qualFileName);
1000 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1001 m->openOutputFile(qualFileName, temp); temp.close();
1006 numFPrimers = primers.size();
1007 numRPrimers = revPrimer.size();
1008 groupCounts.resize(barcodeNameVector.size(), 0);
1011 catch(exception& e) {
1012 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1017 //***************************************************************************************************************
1019 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1022 string rawSequence = seq.getUnaligned();
1023 int success = bdiffs + 1; //guilty until proven innocent
1025 //can you find the barcode
1026 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1027 string oligo = it->first;
1028 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
1029 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
1033 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1035 seq.setUnaligned(rawSequence.substr(oligo.length()));
1037 if(qual.getName() != ""){
1038 qual.trimQScores(oligo.length(), -1);
1046 //if you found the barcode or if you don't want to allow for diffs
1047 if ((bdiffs == 0) || (success == 0)) { return success; }
1049 else { //try aligning and see if you can find it
1053 Alignment* alignment;
1054 if (barcodes.size() > 0) {
1055 map<string,int>::iterator it=barcodes.begin();
1057 for(it;it!=barcodes.end();it++){
1058 if(it->first.length() > maxLength){
1059 maxLength = it->first.length();
1062 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
1064 }else{ alignment = NULL; }
1066 //can you find the barcode
1072 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1073 string oligo = it->first;
1074 // int length = oligo.length();
1076 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
1077 success = bdiffs + 10;
1081 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1082 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1083 oligo = alignment->getSeqAAln();
1084 string temp = alignment->getSeqBAln();
1086 int alnLength = oligo.length();
1088 for(int i=oligo.length()-1;i>=0;i--){
1089 if(oligo[i] != '-'){ alnLength = i+1; break; }
1091 oligo = oligo.substr(0,alnLength);
1092 temp = temp.substr(0,alnLength);
1094 int numDiff = countDiffs(oligo, temp);
1096 if(numDiff < minDiff){
1099 minGroup = it->second;
1101 for(int i=0;i<alnLength;i++){
1107 else if(numDiff == minDiff){
1113 if(minDiff > bdiffs) { success = minDiff; } //no good matches
1114 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
1115 else{ //use the best match
1117 seq.setUnaligned(rawSequence.substr(minPos));
1119 if(qual.getName() != ""){
1120 qual.trimQScores(minPos, -1);
1125 if (alignment != NULL) { delete alignment; }
1132 catch(exception& e) {
1133 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1139 //***************************************************************************************************************
1141 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1143 string rawSequence = seq.getUnaligned();
1144 int success = pdiffs + 1; //guilty until proven innocent
1146 //can you find the primer
1147 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1148 string oligo = it->first;
1149 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
1150 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1154 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1156 seq.setUnaligned(rawSequence.substr(oligo.length()));
1157 if(qual.getName() != ""){
1158 qual.trimQScores(oligo.length(), -1);
1165 //if you found the barcode or if you don't want to allow for diffs
1166 if ((pdiffs == 0) || (success == 0)) { return success; }
1168 else { //try aligning and see if you can find it
1172 Alignment* alignment;
1173 if (primers.size() > 0) {
1174 map<string,int>::iterator it=primers.begin();
1176 for(it;it!=primers.end();it++){
1177 if(it->first.length() > maxLength){
1178 maxLength = it->first.length();
1181 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
1183 }else{ alignment = NULL; }
1185 //can you find the barcode
1191 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1192 string oligo = it->first;
1193 // int length = oligo.length();
1195 if(rawSequence.length() < maxLength){
1196 success = pdiffs + 100;
1200 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1201 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1202 oligo = alignment->getSeqAAln();
1203 string temp = alignment->getSeqBAln();
1205 int alnLength = oligo.length();
1207 for(int i=oligo.length()-1;i>=0;i--){
1208 if(oligo[i] != '-'){ alnLength = i+1; break; }
1210 oligo = oligo.substr(0,alnLength);
1211 temp = temp.substr(0,alnLength);
1213 int numDiff = countDiffs(oligo, temp);
1215 if(numDiff < minDiff){
1218 minGroup = it->second;
1220 for(int i=0;i<alnLength;i++){
1226 else if(numDiff == minDiff){
1232 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1233 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
1234 else{ //use the best match
1236 seq.setUnaligned(rawSequence.substr(minPos));
1237 if(qual.getName() != ""){
1238 qual.trimQScores(minPos, -1);
1243 if (alignment != NULL) { delete alignment; }
1250 catch(exception& e) {
1251 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1256 //***************************************************************************************************************
1258 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1260 string rawSequence = seq.getUnaligned();
1261 bool success = 0; //guilty until proven innocent
1263 for(int i=0;i<numRPrimers;i++){
1264 string oligo = revPrimer[i];
1266 if(rawSequence.length() < oligo.length()){
1271 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1272 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1273 if(qual.getName() != ""){
1274 qual.trimQScores(-1, rawSequence.length()-oligo.length());
1283 catch(exception& e) {
1284 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1289 //***************************************************************************************************************
1291 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1294 if(qscores.getName() != ""){
1295 qscores.trimQScores(-1, keepFirst);
1297 sequence.trim(keepFirst);
1300 catch(exception& e) {
1301 m->errorOut(e, "keepFirstTrim", "countDiffs");
1307 //***************************************************************************************************************
1309 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1313 int length = sequence.getNumBases() - removeLast;
1316 if(qscores.getName() != ""){
1317 qscores.trimQScores(-1, length);
1319 sequence.trim(length);
1328 catch(exception& e) {
1329 m->errorOut(e, "removeLastTrim", "countDiffs");
1335 //***************************************************************************************************************
1337 bool TrimSeqsCommand::cullLength(Sequence& seq){
1340 int length = seq.getNumBases();
1341 bool success = 0; //guilty until proven innocent
1343 if(length >= minLength && maxLength == 0) { success = 1; }
1344 else if(length >= minLength && length <= maxLength) { success = 1; }
1345 else { success = 0; }
1350 catch(exception& e) {
1351 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1357 //***************************************************************************************************************
1359 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1361 int longHomoP = seq.getLongHomoPolymer();
1362 bool success = 0; //guilty until proven innocent
1364 if(longHomoP <= maxHomoP){ success = 1; }
1365 else { success = 0; }
1369 catch(exception& e) {
1370 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1376 //***************************************************************************************************************
1378 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1380 int numNs = seq.getAmbigBases();
1381 bool success = 0; //guilty until proven innocent
1383 if(numNs <= maxAmbig) { success = 1; }
1384 else { success = 0; }
1388 catch(exception& e) {
1389 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1395 //***************************************************************************************************************
1397 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1400 int length = oligo.length();
1402 for(int i=0;i<length;i++){
1404 if(oligo[i] != seq[i]){
1405 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
1406 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
1407 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
1408 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
1409 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
1410 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1411 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
1412 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1413 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1414 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1415 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
1416 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1418 if(success == 0) { break; }
1427 catch(exception& e) {
1428 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1434 //***************************************************************************************************************
1436 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1439 int length = oligo.length();
1442 for(int i=0;i<length;i++){
1444 if(oligo[i] != seq[i]){
1445 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
1446 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
1447 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
1448 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
1449 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
1450 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1451 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
1452 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1453 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1454 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1455 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
1456 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1463 catch(exception& e) {
1464 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1470 //***************************************************************************************************************