5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
13 //**********************************************************************************************************************
15 vector<string> TrimSeqsCommand::getValidParameters(){
17 string Array[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop","minlength", "maxlength", "qfile",
18 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
19 "keepfirst", "removelast",
20 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
21 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25 m->errorOut(e, "TrimSeqsCommand", "getValidParameters");
30 //**********************************************************************************************************************
32 TrimSeqsCommand::TrimSeqsCommand(){
34 abort = true; calledHelp = true;
35 vector<string> tempOutNames;
36 outputTypes["fasta"] = tempOutNames;
37 outputTypes["qfile"] = tempOutNames;
38 outputTypes["group"] = tempOutNames;
41 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
46 //**********************************************************************************************************************
48 vector<string> TrimSeqsCommand::getRequiredParameters(){
50 string Array[] = {"fasta"};
51 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
55 m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters");
60 //**********************************************************************************************************************
62 vector<string> TrimSeqsCommand::getRequiredFiles(){
64 vector<string> myArray;
68 m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles");
73 //***************************************************************************************************************
75 TrimSeqsCommand::TrimSeqsCommand(string option) {
78 abort = false; calledHelp = false;
81 //allow user to run help
82 if(option == "help") { help(); abort = true; calledHelp = true; }
85 //valid paramters for this command
86 string AlignArray[] = { "fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile",
87 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
88 "keepfirst", "removelast",
89 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
91 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
93 OptionParser parser(option);
94 map<string,string> parameters = parser.getParameters();
96 ValidParameters validParameter;
97 map<string,string>::iterator it;
99 //check to make sure all parameters are valid for command
100 for (it = parameters.begin(); it != parameters.end(); it++) {
101 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
104 //initialize outputTypes
105 vector<string> tempOutNames;
106 outputTypes["fasta"] = tempOutNames;
107 outputTypes["qfile"] = tempOutNames;
108 outputTypes["group"] = tempOutNames;
110 //if the user changes the input directory command factory will send this info to us in the output parameter
111 string inputDir = validParameter.validFile(parameters, "inputdir", false);
112 if (inputDir == "not found"){ inputDir = ""; }
115 it = parameters.find("fasta");
116 //user has given a template file
117 if(it != parameters.end()){
118 path = m->hasPath(it->second);
119 //if the user has not given a path then, add inputdir. else leave path alone.
120 if (path == "") { parameters["fasta"] = inputDir + it->second; }
123 it = parameters.find("oligos");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["oligos"] = inputDir + it->second; }
131 it = parameters.find("qfile");
132 //user has given a template file
133 if(it != parameters.end()){
134 path = m->hasPath(it->second);
135 //if the user has not given a path then, add inputdir. else leave path alone.
136 if (path == "") { parameters["qfile"] = inputDir + it->second; }
142 //check for required parameters
143 fastaFile = validParameter.validFile(parameters, "fasta", true);
144 if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; }
145 else if (fastaFile == "not open") { abort = true; }
147 //if the user changes the output directory command factory will send this info to us in the output parameter
148 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
150 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
154 //check for optional parameter and set defaults
155 // ...at some point should added some additional type checking...
157 temp = validParameter.validFile(parameters, "flip", false);
158 if (temp == "not found"){ flip = 0; }
159 else if(m->isTrue(temp)) { flip = 1; }
161 temp = validParameter.validFile(parameters, "oligos", true);
162 if (temp == "not found"){ oligoFile = ""; }
163 else if(temp == "not open"){ abort = true; }
164 else { oligoFile = temp; }
167 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
168 convert(temp, maxAmbig);
170 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
171 convert(temp, maxHomoP);
173 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
174 convert(temp, minLength);
176 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
177 convert(temp, maxLength);
179 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
180 convert(temp, bdiffs);
182 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
183 convert(temp, pdiffs);
185 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
186 convert(temp, tdiffs);
188 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
190 temp = validParameter.validFile(parameters, "qfile", true);
191 if (temp == "not found") { qFileName = ""; }
192 else if(temp == "not open") { abort = true; }
193 else { qFileName = temp; }
195 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
196 convert(temp, qThreshold);
198 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
199 qtrim = m->isTrue(temp);
201 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
202 convert(temp, qRollAverage);
204 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
205 convert(temp, qWindowAverage);
207 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
208 convert(temp, qWindowSize);
210 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
211 convert(temp, qWindowStep);
213 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
214 convert(temp, qAverage);
216 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
217 convert(temp, keepFirst);
219 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
220 convert(temp, removeLast);
222 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
223 allFiles = m->isTrue(temp);
225 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
226 convert(temp, processors);
229 if(allFiles && (oligoFile == "")){
230 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
232 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
233 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
237 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
238 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
244 catch(exception& e) {
245 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
250 //**********************************************************************************************************************
252 void TrimSeqsCommand::help(){
254 m->mothurOut("The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n");
255 m->mothurOut("The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n");
256 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n");
257 m->mothurOut("The fasta parameter is required.\n");
258 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
259 m->mothurOut("The oligos parameter allows you to provide an oligos file.\n");
260 m->mothurOut("The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n");
261 m->mothurOut("The maxhomop parameter allows you to set a maximum homopolymer length. \n");
262 m->mothurOut("The minlength parameter allows you to set and minimum sequence length. \n");
263 m->mothurOut("The maxlength parameter allows you to set and maximum sequence length. \n");
264 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
265 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
266 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
267 m->mothurOut("The qfile parameter allows you to provide a quality file.\n");
268 m->mothurOut("The qthreshold parameter allows you to set a minimum quality score allowed. \n");
269 m->mothurOut("The qaverage parameter allows you to set a minimum average quality score allowed. \n");
270 m->mothurOut("The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n");
271 m->mothurOut("The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n");
272 m->mothurOut("The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n");
273 m->mothurOut("The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n");
274 m->mothurOut("The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n");
275 m->mothurOut("The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n");
276 m->mothurOut("The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n");
277 m->mothurOut("The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n");
278 m->mothurOut("The trim.seqs command should be in the following format: \n");
279 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n");
280 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n");
281 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
282 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
283 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
286 catch(exception& e) {
287 m->errorOut(e, "TrimSeqsCommand", "help");
293 //***************************************************************************************************************
295 TrimSeqsCommand::~TrimSeqsCommand(){ /* do nothing */ }
297 //***************************************************************************************************************
299 int TrimSeqsCommand::execute(){
302 if (abort == true) { if (calledHelp) { return 0; } return 2; }
304 numFPrimers = 0; //this needs to be initialized
306 vector<vector<string> > fastaFileNames;
307 vector<vector<string> > qualFileNames;
309 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
310 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
312 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
313 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
315 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
316 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
317 if (qFileName != "") {
318 outputNames.push_back(trimQualFile);
319 outputNames.push_back(scrapQualFile);
320 outputTypes["qfile"].push_back(trimQualFile);
321 outputTypes["qfile"].push_back(scrapQualFile);
324 string outputGroupFileName;
327 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
328 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
329 getOligos(fastaFileNames, qualFileNames);
332 vector<unsigned long int> fastaFilePos;
333 vector<unsigned long int> qFilePos;
335 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
337 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
338 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
339 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
341 if(qFileName == "") { qLines = lines; } //files with duds
343 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
345 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
347 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames);
350 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
353 if (m->control_pressed) { return 0; }
357 for(int i=0;i<fastaFileNames.size();i++){
358 for(int j=0;j<fastaFileNames[0].size();j++){
359 if(m->isBlank(fastaFileNames[i][j])){
360 remove(fastaFileNames[i][j].c_str());
363 remove(fastaFileNames[i][j].c_str());
373 if (m->control_pressed) {
374 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
378 m->mothurOutEndLine();
379 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
380 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
381 m->mothurOutEndLine();
386 catch(exception& e) {
387 m->errorOut(e, "TrimSeqsCommand", "execute");
392 /**************************************************************************************/
394 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {
398 ofstream trimFASTAFile;
399 m->openOutputFile(trimFileName, trimFASTAFile);
401 ofstream scrapFASTAFile;
402 m->openOutputFile(scrapFileName, scrapFASTAFile);
404 ofstream trimQualFile;
405 ofstream scrapQualFile;
407 m->openOutputFile(trimQFileName, trimQualFile);
408 m->openOutputFile(scrapQFileName, scrapQualFile);
411 ofstream outGroupsFile;
412 if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); }
415 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
416 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
418 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
420 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
427 m->openInputFile(filename, inFASTA);
428 inFASTA.seekg(line->start);
431 if(qFileName != "") {
432 m->openInputFile(qFileName, qFile);
433 qFile.seekg(qline->start);
441 if (m->control_pressed) {
442 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
443 if (oligoFile != "") { outGroupsFile.close(); }
448 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
454 string trashCode = "";
455 int currentSeqsDiffs = 0;
457 Sequence currSeq(inFASTA); m->gobble(inFASTA);
459 QualityScores currQual;
461 currQual = QualityScores(qFile); m->gobble(qFile);
464 string origSeq = currSeq.getUnaligned();
467 int barcodeIndex = 0;
470 if(barcodes.size() != 0){
471 success = stripBarcode(currSeq, currQual, barcodeIndex);
472 if(success > bdiffs) { trashCode += 'b'; }
473 else{ currentSeqsDiffs += success; }
476 if(numFPrimers != 0){
477 success = stripForward(currSeq, currQual, primerIndex);
478 if(success > pdiffs) { trashCode += 'f'; }
479 else{ currentSeqsDiffs += success; }
482 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
484 if(numRPrimers != 0){
485 success = stripReverse(currSeq, currQual);
486 if(!success) { trashCode += 'r'; }
490 success = keepFirstTrim(currSeq, currQual);
494 success = removeLastTrim(currSeq, currQual);
495 if(!success) { trashCode += 'l'; }
500 int origLength = currSeq.getNumBases();
502 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
503 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
504 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
505 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
506 else { success = 1; }
508 //you don't want to trim, if it fails above then scrap it
509 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
511 if(!success) { trashCode += 'q'; }
514 if(minLength > 0 || maxLength > 0){
515 success = cullLength(currSeq);
516 if(!success) { trashCode += 'l'; }
519 success = cullHomoP(currSeq);
520 if(!success) { trashCode += 'h'; }
523 success = cullAmbigs(currSeq);
524 if(!success) { trashCode += 'n'; }
527 if(flip){ // should go last
528 currSeq.reverseComplement();
530 currQual.flipQScores();
534 if(trashCode.length() == 0){
535 currSeq.setAligned(currSeq.getUnaligned());
536 currSeq.printSequence(trimFASTAFile);
539 currQual.printQScores(trimQualFile);
542 if(barcodes.size() != 0){
543 outGroupsFile << currSeq.getName() << '\t' << barcodeNameVector[barcodeIndex] << endl;
549 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
550 currSeq.printSequence(output);
554 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
555 currQual.printQScores(output);
561 currSeq.setName(currSeq.getName() + '|' + trashCode);
562 currSeq.setUnaligned(origSeq);
563 currSeq.setAligned(origSeq);
564 currSeq.printSequence(scrapFASTAFile);
566 currQual.printQScores(scrapQualFile);
572 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
573 unsigned long int pos = inFASTA.tellg();
574 if ((pos == -1) || (pos >= line->end)) { break; }
576 if (inFASTA.eof()) { break; }
580 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
584 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
588 trimFASTAFile.close();
589 scrapFASTAFile.close();
590 if (oligoFile != "") { outGroupsFile.close(); }
591 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
595 catch(exception& e) {
596 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
601 /**************************************************************************************************/
603 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
605 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
610 //loop through and create all the processes you want
611 while (process != processors) {
615 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
619 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
620 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
625 for(int i=0;i<tempFASTAFileNames.size();i++){
626 for(int j=0;j<tempFASTAFileNames[i].size();j++){
627 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
628 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
631 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
632 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
638 driverCreateTrim(filename,
640 (trimFASTAFileName + toString(getpid()) + ".temp"),
641 (scrapFASTAFileName + toString(getpid()) + ".temp"),
642 (trimQualFileName + toString(getpid()) + ".temp"),
643 (scrapQualFileName + toString(getpid()) + ".temp"),
644 (groupFile + toString(getpid()) + ".temp"),
646 tempPrimerQualFileNames,
652 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
653 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
660 m->openOutputFile(trimFASTAFileName, temp); temp.close();
661 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
662 m->openOutputFile(trimQualFileName, temp); temp.close();
663 m->openOutputFile(scrapQualFileName, temp); temp.close();
667 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
670 //force parent to wait until all the processes are done
671 for (int i=0;i<processIDS.size();i++) {
672 int temp = processIDS[i];
677 for(int i=0;i<processIDS.size();i++){
679 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
681 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
682 remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
683 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
684 remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
687 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
688 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
689 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
690 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
693 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
694 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
698 for(int j=0;j<fastaFileNames.size();j++){
699 for(int k=0;k<fastaFileNames[j].size();k++){
700 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
701 remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
704 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
705 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
716 catch(exception& e) {
717 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
722 /**************************************************************************************************/
724 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
727 //set file positions for fasta file
728 fastaFilePos = m->divideFile(filename, processors);
730 if (qfilename == "") { return processors; }
732 //get name of first sequence in each chunk
733 map<string, int> firstSeqNames;
734 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
736 m->openInputFile(filename, in);
737 in.seekg(fastaFilePos[i]);
740 firstSeqNames[temp.getName()] = i;
745 //seach for filePos of each first name in the qfile and save in qfileFilePos
747 m->openInputFile(qfilename, inQual);
750 while(!inQual.eof()){
751 input = m->getline(inQual);
753 if (input.length() != 0) {
754 if(input[0] == '>'){ //this is a sequence name line
755 istringstream nameStream(input);
757 string sname = ""; nameStream >> sname;
758 sname = sname.substr(1);
760 map<string, int>::iterator it = firstSeqNames.find(sname);
762 if(it != firstSeqNames.end()) { //this is the start of a new chunk
763 unsigned long int pos = inQual.tellg();
764 qfileFilePos.push_back(pos - input.length() - 1);
765 firstSeqNames.erase(it);
770 if (firstSeqNames.size() == 0) { break; }
775 if (firstSeqNames.size() != 0) {
776 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
777 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
783 //get last file position of qfile
785 unsigned long int size;
787 //get num bytes in file
788 pFile = fopen (qfilename.c_str(),"rb");
789 if (pFile==NULL) perror ("Error opening file");
791 fseek (pFile, 0, SEEK_END);
796 qfileFilePos.push_back(size);
800 catch(exception& e) {
801 m->errorOut(e, "TrimSeqsCommand", "setLines");
806 //***************************************************************************************************************
808 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
811 m->openInputFile(oligoFile, inOligos);
815 string type, oligo, group;
818 int indexBarcode = 0;
820 while(!inOligos.eof()){
822 inOligos >> type; m->gobble(inOligos);
825 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
828 //make type case insensitive
829 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
833 for(int i=0;i<oligo.length();i++){
834 oligo[i] = toupper(oligo[i]);
835 if(oligo[i] == 'U') { oligo[i] = 'T'; }
838 if(type == "FORWARD"){
841 // get rest of line in case there is a primer name
842 while (!inOligos.eof()) {
843 char c = inOligos.get();
844 if (c == 10 || c == 13){ break; }
845 else if (c == 32 || c == 9){;} //space or tab
849 //check for repeat barcodes
850 map<string, int>::iterator itPrime = primers.find(oligo);
851 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
853 primers[oligo]=indexPrimer; indexPrimer++;
854 primerNameVector.push_back(group);
856 else if(type == "REVERSE"){
857 Sequence oligoRC("reverse", oligo);
858 oligoRC.reverseComplement();
859 revPrimer.push_back(oligoRC.getUnaligned());
861 else if(type == "BARCODE"){
864 //check for repeat barcodes
865 map<string, int>::iterator itBar = barcodes.find(oligo);
866 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
868 barcodes[oligo]=indexBarcode; indexBarcode++;
869 barcodeNameVector.push_back(group);
871 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
877 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
879 //add in potential combos
880 if(barcodeNameVector.size() == 0){
882 barcodeNameVector.push_back("");
885 if(primerNameVector.size() == 0){
887 primerNameVector.push_back("");
890 fastaFileNames.resize(barcodeNameVector.size());
891 for(int i=0;i<fastaFileNames.size();i++){
892 fastaFileNames[i].assign(primerNameVector.size(), "");
894 if(qFileName != ""){ qualFileNames = fastaFileNames; }
897 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
898 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
900 string primerName = primerNameVector[itPrimer->second];
901 string barcodeName = barcodeNameVector[itBar->second];
903 string comboGroupName = "";
904 string fastaFileName = "";
905 string qualFileName = "";
907 if(primerName == ""){
908 comboGroupName = barcodeNameVector[itBar->second];
911 if(barcodeName == ""){
912 comboGroupName = primerNameVector[itPrimer->second];
915 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
920 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
921 outputNames.push_back(fastaFileName);
922 outputTypes["fasta"].push_back(fastaFileName);
923 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
924 m->openOutputFile(fastaFileName, temp); temp.close();
927 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
928 outputNames.push_back(qualFileName);
929 outputTypes["qfile"].push_back(qualFileName);
930 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
931 m->openOutputFile(qualFileName, temp); temp.close();
936 numFPrimers = primers.size();
937 numRPrimers = revPrimer.size();
940 catch(exception& e) {
941 m->errorOut(e, "TrimSeqsCommand", "getOligos");
946 //***************************************************************************************************************
948 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
951 string rawSequence = seq.getUnaligned();
952 int success = bdiffs + 1; //guilty until proven innocent
954 //can you find the barcode
955 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
956 string oligo = it->first;
957 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
958 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
962 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
964 seq.setUnaligned(rawSequence.substr(oligo.length()));
966 if(qual.getName() != ""){
967 qual.trimQScores(oligo.length(), -1);
975 //if you found the barcode or if you don't want to allow for diffs
976 if ((bdiffs == 0) || (success == 0)) { return success; }
978 else { //try aligning and see if you can find it
982 Alignment* alignment;
983 if (barcodes.size() > 0) {
984 map<string,int>::iterator it=barcodes.begin();
986 for(it;it!=barcodes.end();it++){
987 if(it->first.length() > maxLength){
988 maxLength = it->first.length();
991 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
993 }else{ alignment = NULL; }
995 //can you find the barcode
1001 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1002 string oligo = it->first;
1003 // int length = oligo.length();
1005 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
1006 success = bdiffs + 10;
1010 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1011 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1012 oligo = alignment->getSeqAAln();
1013 string temp = alignment->getSeqBAln();
1015 int alnLength = oligo.length();
1017 for(int i=oligo.length()-1;i>=0;i--){
1018 if(oligo[i] != '-'){ alnLength = i+1; break; }
1020 oligo = oligo.substr(0,alnLength);
1021 temp = temp.substr(0,alnLength);
1023 int numDiff = countDiffs(oligo, temp);
1025 if(numDiff < minDiff){
1028 minGroup = it->second;
1030 for(int i=0;i<alnLength;i++){
1036 else if(numDiff == minDiff){
1042 if(minDiff > bdiffs) { success = minDiff; } //no good matches
1043 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
1044 else{ //use the best match
1046 seq.setUnaligned(rawSequence.substr(minPos));
1048 if(qual.getName() != ""){
1049 qual.trimQScores(minPos, -1);
1054 if (alignment != NULL) { delete alignment; }
1061 catch(exception& e) {
1062 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1068 //***************************************************************************************************************
1070 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1072 string rawSequence = seq.getUnaligned();
1073 int success = pdiffs + 1; //guilty until proven innocent
1075 //can you find the primer
1076 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1077 string oligo = it->first;
1078 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
1079 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1083 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1085 seq.setUnaligned(rawSequence.substr(oligo.length()));
1086 if(qual.getName() != ""){
1087 qual.trimQScores(oligo.length(), -1);
1094 //if you found the barcode or if you don't want to allow for diffs
1095 if ((pdiffs == 0) || (success == 0)) { return success; }
1097 else { //try aligning and see if you can find it
1101 Alignment* alignment;
1102 if (primers.size() > 0) {
1103 map<string,int>::iterator it=primers.begin();
1105 for(it;it!=primers.end();it++){
1106 if(it->first.length() > maxLength){
1107 maxLength = it->first.length();
1110 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
1112 }else{ alignment = NULL; }
1114 //can you find the barcode
1120 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1121 string oligo = it->first;
1122 // int length = oligo.length();
1124 if(rawSequence.length() < maxLength){
1125 success = pdiffs + 100;
1129 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1130 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1131 oligo = alignment->getSeqAAln();
1132 string temp = alignment->getSeqBAln();
1134 int alnLength = oligo.length();
1136 for(int i=oligo.length()-1;i>=0;i--){
1137 if(oligo[i] != '-'){ alnLength = i+1; break; }
1139 oligo = oligo.substr(0,alnLength);
1140 temp = temp.substr(0,alnLength);
1142 int numDiff = countDiffs(oligo, temp);
1144 if(numDiff < minDiff){
1147 minGroup = it->second;
1149 for(int i=0;i<alnLength;i++){
1155 else if(numDiff == minDiff){
1161 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1162 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
1163 else{ //use the best match
1165 seq.setUnaligned(rawSequence.substr(minPos));
1166 if(qual.getName() != ""){
1167 qual.trimQScores(minPos, -1);
1172 if (alignment != NULL) { delete alignment; }
1179 catch(exception& e) {
1180 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1185 //***************************************************************************************************************
1187 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1189 string rawSequence = seq.getUnaligned();
1190 bool success = 0; //guilty until proven innocent
1192 for(int i=0;i<numRPrimers;i++){
1193 string oligo = revPrimer[i];
1195 if(rawSequence.length() < oligo.length()){
1200 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1201 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1202 if(qual.getName() != ""){
1203 qual.trimQScores(-1, rawSequence.length()-oligo.length());
1212 catch(exception& e) {
1213 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1218 //***************************************************************************************************************
1220 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1223 if(qscores.getName() != ""){
1224 qscores.trimQScores(-1, keepFirst);
1226 sequence.trim(keepFirst);
1229 catch(exception& e) {
1230 m->errorOut(e, "keepFirstTrim", "countDiffs");
1236 //***************************************************************************************************************
1238 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1242 int length = sequence.getNumBases() - removeLast;
1245 if(qscores.getName() != ""){
1246 qscores.trimQScores(-1, length);
1248 sequence.trim(length);
1257 catch(exception& e) {
1258 m->errorOut(e, "removeLastTrim", "countDiffs");
1264 //***************************************************************************************************************
1266 bool TrimSeqsCommand::cullLength(Sequence& seq){
1269 int length = seq.getNumBases();
1270 bool success = 0; //guilty until proven innocent
1272 if(length >= minLength && maxLength == 0) { success = 1; }
1273 else if(length >= minLength && length <= maxLength) { success = 1; }
1274 else { success = 0; }
1279 catch(exception& e) {
1280 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1286 //***************************************************************************************************************
1288 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1290 int longHomoP = seq.getLongHomoPolymer();
1291 bool success = 0; //guilty until proven innocent
1293 if(longHomoP <= maxHomoP){ success = 1; }
1294 else { success = 0; }
1298 catch(exception& e) {
1299 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1305 //***************************************************************************************************************
1307 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1309 int numNs = seq.getAmbigBases();
1310 bool success = 0; //guilty until proven innocent
1312 if(numNs <= maxAmbig) { success = 1; }
1313 else { success = 0; }
1317 catch(exception& e) {
1318 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1324 //***************************************************************************************************************
1326 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1329 int length = oligo.length();
1331 for(int i=0;i<length;i++){
1333 if(oligo[i] != seq[i]){
1334 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
1335 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
1336 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
1337 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
1338 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
1339 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1340 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
1341 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1342 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1343 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1344 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
1345 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1347 if(success == 0) { break; }
1356 catch(exception& e) {
1357 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1363 //***************************************************************************************************************
1365 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1368 int length = oligo.length();
1371 for(int i=0;i<length;i++){
1373 if(oligo[i] != seq[i]){
1374 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
1375 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
1376 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
1377 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
1378 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
1379 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1380 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
1381 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1382 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1383 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1384 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
1385 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1392 catch(exception& e) {
1393 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1399 //***************************************************************************************************************