5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
13 //**********************************************************************************************************************
15 vector<string> TrimSeqsCommand::getValidParameters(){
17 string Array[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop","minlength", "maxlength", "qfile",
18 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
19 "keepfirst", "removelast",
20 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
21 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25 m->errorOut(e, "TrimSeqsCommand", "getValidParameters");
30 //**********************************************************************************************************************
32 TrimSeqsCommand::TrimSeqsCommand(){
34 abort = true; calledHelp = true;
35 vector<string> tempOutNames;
36 outputTypes["fasta"] = tempOutNames;
37 outputTypes["qual"] = tempOutNames;
40 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
45 //**********************************************************************************************************************
47 vector<string> TrimSeqsCommand::getRequiredParameters(){
49 string Array[] = {"fasta"};
50 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
54 m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters");
59 //**********************************************************************************************************************
61 vector<string> TrimSeqsCommand::getRequiredFiles(){
63 vector<string> myArray;
67 m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles");
72 //***************************************************************************************************************
74 TrimSeqsCommand::TrimSeqsCommand(string option) {
77 abort = false; calledHelp = false;
80 //allow user to run help
81 if(option == "help") { help(); abort = true; calledHelp = true; }
84 //valid paramters for this command
85 string AlignArray[] = { "fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile",
86 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
87 "keepfirst", "removelast",
88 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
90 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
92 OptionParser parser(option);
93 map<string,string> parameters = parser.getParameters();
95 ValidParameters validParameter;
96 map<string,string>::iterator it;
98 //check to make sure all parameters are valid for command
99 for (it = parameters.begin(); it != parameters.end(); it++) {
100 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
103 //initialize outputTypes
104 vector<string> tempOutNames;
105 outputTypes["fasta"] = tempOutNames;
106 outputTypes["qual"] = tempOutNames;
108 //if the user changes the input directory command factory will send this info to us in the output parameter
109 string inputDir = validParameter.validFile(parameters, "inputdir", false);
110 if (inputDir == "not found"){ inputDir = ""; }
113 it = parameters.find("fasta");
114 //user has given a template file
115 if(it != parameters.end()){
116 path = m->hasPath(it->second);
117 //if the user has not given a path then, add inputdir. else leave path alone.
118 if (path == "") { parameters["fasta"] = inputDir + it->second; }
121 it = parameters.find("oligos");
122 //user has given a template file
123 if(it != parameters.end()){
124 path = m->hasPath(it->second);
125 //if the user has not given a path then, add inputdir. else leave path alone.
126 if (path == "") { parameters["oligos"] = inputDir + it->second; }
129 it = parameters.find("qfile");
130 //user has given a template file
131 if(it != parameters.end()){
132 path = m->hasPath(it->second);
133 //if the user has not given a path then, add inputdir. else leave path alone.
134 if (path == "") { parameters["qfile"] = inputDir + it->second; }
140 //check for required parameters
141 fastaFile = validParameter.validFile(parameters, "fasta", true);
142 if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; }
143 else if (fastaFile == "not open") { abort = true; }
145 //if the user changes the output directory command factory will send this info to us in the output parameter
146 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
148 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
152 //check for optional parameter and set defaults
153 // ...at some point should added some additional type checking...
155 temp = validParameter.validFile(parameters, "flip", false);
156 if (temp == "not found"){ flip = 0; }
157 else if(m->isTrue(temp)) { flip = 1; }
159 temp = validParameter.validFile(parameters, "oligos", true);
160 if (temp == "not found"){ oligoFile = ""; }
161 else if(temp == "not open"){ abort = true; }
162 else { oligoFile = temp; }
165 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
166 convert(temp, maxAmbig);
168 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
169 convert(temp, maxHomoP);
171 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
172 convert(temp, minLength);
174 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
175 convert(temp, maxLength);
177 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
178 convert(temp, bdiffs);
180 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
181 convert(temp, pdiffs);
183 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
184 convert(temp, tdiffs);
186 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
188 temp = validParameter.validFile(parameters, "qfile", true);
189 if (temp == "not found") { qFileName = ""; }
190 else if(temp == "not open") { abort = true; }
191 else { qFileName = temp; }
193 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
194 convert(temp, qThreshold);
196 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
197 qtrim = m->isTrue(temp);
199 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
200 convert(temp, qRollAverage);
202 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
203 convert(temp, qWindowAverage);
205 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
206 convert(temp, qWindowSize);
208 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
209 convert(temp, qWindowStep);
211 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
212 convert(temp, qAverage);
214 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
215 convert(temp, keepFirst);
217 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
218 convert(temp, removeLast);
220 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
221 allFiles = m->isTrue(temp);
223 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
224 convert(temp, processors);
226 if ((oligoFile != "") && (groupfile != "")) {
227 m->mothurOut("You given both a oligos file and a groupfile, only one is allowed."); m->mothurOutEndLine(); abort = true;
231 if(allFiles && (oligoFile == "") && (groupfile == "")){
232 m->mothurOut("You selected allfiles, but didn't enter an oligos or group file. Ignoring the allfiles request."); m->mothurOutEndLine();
234 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
235 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
239 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
240 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
246 catch(exception& e) {
247 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
252 //**********************************************************************************************************************
254 void TrimSeqsCommand::help(){
256 m->mothurOut("The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n");
257 m->mothurOut("The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n");
258 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n");
259 m->mothurOut("The fasta parameter is required.\n");
260 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
261 m->mothurOut("The oligos parameter allows you to provide an oligos file.\n");
262 m->mothurOut("The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n");
263 m->mothurOut("The maxhomop parameter allows you to set a maximum homopolymer length. \n");
264 m->mothurOut("The minlength parameter allows you to set and minimum sequence length. \n");
265 m->mothurOut("The maxlength parameter allows you to set and maximum sequence length. \n");
266 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
267 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
268 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
269 m->mothurOut("The qfile parameter allows you to provide a quality file.\n");
270 m->mothurOut("The qthreshold parameter allows you to set a minimum quality score allowed. \n");
271 m->mothurOut("The qaverage parameter allows you to set a minimum average quality score allowed. \n");
272 m->mothurOut("The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n");
273 m->mothurOut("The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n");
274 m->mothurOut("The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n");
275 m->mothurOut("The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n");
276 m->mothurOut("The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n");
277 m->mothurOut("The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n");
278 m->mothurOut("The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n");
279 m->mothurOut("The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n");
280 m->mothurOut("The trim.seqs command should be in the following format: \n");
281 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n");
282 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n");
283 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
284 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
285 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
288 catch(exception& e) {
289 m->errorOut(e, "TrimSeqsCommand", "help");
295 //***************************************************************************************************************
297 TrimSeqsCommand::~TrimSeqsCommand(){ /* do nothing */ }
299 //***************************************************************************************************************
301 int TrimSeqsCommand::execute(){
304 if (abort == true) { if (calledHelp) { return 0; } return 2; }
306 numFPrimers = 0; //this needs to be initialized
308 vector<vector<string> > fastaFileNames;
309 vector<vector<string> > qualFileNames;
311 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
312 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
314 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
315 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
317 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
318 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
319 if (qFileName != "") {
320 outputNames.push_back(trimQualFile);
321 outputNames.push_back(scrapQualFile);
322 outputTypes["qual"].push_back(trimQualFile);
323 outputTypes["qual"].push_back(scrapQualFile);
326 string outputGroupFileName;
329 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
330 outputNames.push_back(outputGroupFileName); outputTypes["groups"].push_back(outputGroupFileName);
331 getOligos(fastaFileNames, qualFileNames);
334 vector<unsigned long int> fastaFilePos;
335 vector<unsigned long int> qFilePos;
337 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
339 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
340 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
341 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
343 if(qFileName == "") { qLines = lines; } //files with duds
345 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
347 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
349 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames);
352 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
355 if (m->control_pressed) { return 0; }
359 for(int i=0;i<fastaFileNames.size();i++){
360 for(int j=0;j<fastaFileNames[0].size();j++){
361 if(m->isBlank(fastaFileNames[i][j])){
362 remove(fastaFileNames[i][j].c_str());
365 remove(fastaFileNames[i][j].c_str());
375 if (m->control_pressed) {
376 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
380 m->mothurOutEndLine();
381 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
382 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
383 m->mothurOutEndLine();
388 catch(exception& e) {
389 m->errorOut(e, "TrimSeqsCommand", "execute");
394 /**************************************************************************************/
396 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {
400 ofstream trimFASTAFile;
401 m->openOutputFile(trimFileName, trimFASTAFile);
403 ofstream scrapFASTAFile;
404 m->openOutputFile(scrapFileName, scrapFASTAFile);
406 ofstream trimQualFile;
407 ofstream scrapQualFile;
409 m->openOutputFile(trimQFileName, trimQualFile);
410 m->openOutputFile(scrapQFileName, scrapQualFile);
413 ofstream outGroupsFile;
414 if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); }
417 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
418 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
420 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
422 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
429 m->openInputFile(filename, inFASTA);
430 inFASTA.seekg(line->start);
433 if(qFileName != "") {
434 m->openInputFile(qFileName, qFile);
435 qFile.seekg(qline->start);
443 if (m->control_pressed) {
444 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
445 if (oligoFile != "") { outGroupsFile.close(); }
450 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
456 string trashCode = "";
457 int currentSeqsDiffs = 0;
459 Sequence currSeq(inFASTA); m->gobble(inFASTA);
461 QualityScores currQual;
463 currQual = QualityScores(qFile); m->gobble(qFile);
466 string origSeq = currSeq.getUnaligned();
469 int barcodeIndex = 0;
472 if(barcodes.size() != 0){
473 success = stripBarcode(currSeq, currQual, barcodeIndex);
474 if(success > bdiffs) { trashCode += 'b'; }
475 else{ currentSeqsDiffs += success; }
478 if(numFPrimers != 0){
479 success = stripForward(currSeq, currQual, primerIndex);
480 if(success > pdiffs) { trashCode += 'f'; }
481 else{ currentSeqsDiffs += success; }
484 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
486 if(numRPrimers != 0){
487 success = stripReverse(currSeq, currQual);
488 if(!success) { trashCode += 'r'; }
492 success = keepFirstTrim(currSeq, currQual);
496 success = removeLastTrim(currSeq, currQual);
497 if(!success) { trashCode += 'l'; }
502 int origLength = currSeq.getNumBases();
504 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
505 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
506 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
507 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
508 else { success = 1; }
510 //you don't want to trim, if it fails above then scrap it
511 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
513 if(!success) { trashCode += 'q'; }
516 if(minLength > 0 || maxLength > 0){
517 success = cullLength(currSeq);
518 if(!success) { trashCode += 'l'; }
521 success = cullHomoP(currSeq);
522 if(!success) { trashCode += 'h'; }
525 success = cullAmbigs(currSeq);
526 if(!success) { trashCode += 'n'; }
529 if(flip){ // should go last
530 currSeq.reverseComplement();
532 currQual.flipQScores();
536 if(trashCode.length() == 0){
537 currSeq.setAligned(currSeq.getUnaligned());
538 currSeq.printSequence(trimFASTAFile);
541 currQual.printQScores(trimQualFile);
544 if(barcodes.size() != 0){
545 outGroupsFile << currSeq.getName() << '\t' << barcodeNameVector[barcodeIndex] << endl;
551 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
552 currSeq.printSequence(output);
556 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
557 currQual.printQScores(output);
563 currSeq.setName(currSeq.getName() + '|' + trashCode);
564 currSeq.setUnaligned(origSeq);
565 currSeq.setAligned(origSeq);
566 currSeq.printSequence(scrapFASTAFile);
568 currQual.printQScores(scrapQualFile);
574 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
575 unsigned long int pos = inFASTA.tellg();
576 if ((pos == -1) || (pos >= line->end)) { break; }
578 if (inFASTA.eof()) { break; }
582 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
586 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
590 trimFASTAFile.close();
591 scrapFASTAFile.close();
592 if (oligoFile != "") { outGroupsFile.close(); }
593 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
597 catch(exception& e) {
598 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
603 /**************************************************************************************************/
605 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
607 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
612 //loop through and create all the processes you want
613 while (process != processors) {
617 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
621 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
622 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
627 for(int i=0;i<tempFASTAFileNames.size();i++){
628 for(int j=0;j<tempFASTAFileNames[i].size();j++){
629 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
630 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
633 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
634 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
640 driverCreateTrim(filename,
642 (trimFASTAFileName + toString(getpid()) + ".temp"),
643 (scrapFASTAFileName + toString(getpid()) + ".temp"),
644 (trimQualFileName + toString(getpid()) + ".temp"),
645 (scrapQualFileName + toString(getpid()) + ".temp"),
646 (groupFile + toString(getpid()) + ".temp"),
648 tempPrimerQualFileNames,
654 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
655 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
662 m->openOutputFile(trimFASTAFileName, temp); temp.close();
663 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
664 m->openOutputFile(trimQualFileName, temp); temp.close();
665 m->openOutputFile(scrapQualFileName, temp); temp.close();
669 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
672 //force parent to wait until all the processes are done
673 for (int i=0;i<processIDS.size();i++) {
674 int temp = processIDS[i];
679 for(int i=0;i<processIDS.size();i++){
681 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
683 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
684 remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
685 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
686 remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
689 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
690 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
691 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
692 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
695 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
696 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
700 for(int j=0;j<fastaFileNames.size();j++){
701 for(int k=0;k<fastaFileNames[j].size();k++){
702 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
703 remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
706 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
707 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
718 catch(exception& e) {
719 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
724 /**************************************************************************************************/
726 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
729 //set file positions for fasta file
730 fastaFilePos = m->divideFile(filename, processors);
732 if (qfilename == "") { return processors; }
734 //get name of first sequence in each chunk
735 map<string, int> firstSeqNames;
736 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
738 m->openInputFile(filename, in);
739 in.seekg(fastaFilePos[i]);
742 firstSeqNames[temp.getName()] = i;
747 //seach for filePos of each first name in the qfile and save in qfileFilePos
749 m->openInputFile(qfilename, inQual);
752 while(!inQual.eof()){
753 input = m->getline(inQual);
755 if (input.length() != 0) {
756 if(input[0] == '>'){ //this is a sequence name line
757 istringstream nameStream(input);
759 string sname = ""; nameStream >> sname;
760 sname = sname.substr(1);
762 map<string, int>::iterator it = firstSeqNames.find(sname);
764 if(it != firstSeqNames.end()) { //this is the start of a new chunk
765 unsigned long int pos = inQual.tellg();
766 qfileFilePos.push_back(pos - input.length() - 1);
767 firstSeqNames.erase(it);
772 if (firstSeqNames.size() == 0) { break; }
777 if (firstSeqNames.size() != 0) {
778 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
779 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
785 //get last file position of qfile
787 unsigned long int size;
789 //get num bytes in file
790 pFile = fopen (qfilename.c_str(),"rb");
791 if (pFile==NULL) perror ("Error opening file");
793 fseek (pFile, 0, SEEK_END);
798 qfileFilePos.push_back(size);
802 catch(exception& e) {
803 m->errorOut(e, "TrimSeqsCommand", "setLines");
808 //***************************************************************************************************************
810 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
813 m->openInputFile(oligoFile, inOligos);
817 string type, oligo, group;
820 int indexBarcode = 0;
822 while(!inOligos.eof()){
824 inOligos >> type; m->gobble(inOligos);
827 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
830 //make type case insensitive
831 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
835 for(int i=0;i<oligo.length();i++){
836 oligo[i] = toupper(oligo[i]);
837 if(oligo[i] == 'U') { oligo[i] = 'T'; }
840 if(type == "FORWARD"){
843 // get rest of line in case there is a primer name
844 while (!inOligos.eof()) {
845 char c = inOligos.get();
846 if (c == 10 || c == 13){ break; }
847 else if (c == 32 || c == 9){;} //space or tab
851 //check for repeat barcodes
852 map<string, int>::iterator itPrime = primers.find(oligo);
853 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
855 primers[oligo]=indexPrimer; indexPrimer++;
856 primerNameVector.push_back(group);
858 else if(type == "REVERSE"){
859 Sequence oligoRC("reverse", oligo);
860 oligoRC.reverseComplement();
861 revPrimer.push_back(oligoRC.getUnaligned());
863 else if(type == "BARCODE"){
866 //check for repeat barcodes
867 map<string, int>::iterator itBar = barcodes.find(oligo);
868 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
870 barcodes[oligo]=indexBarcode; indexBarcode++;
871 barcodeNameVector.push_back(group);
873 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
879 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
881 //add in potential combos
882 if(barcodeNameVector.size() == 0){
884 barcodeNameVector.push_back("");
887 if(primerNameVector.size() == 0){
889 primerNameVector.push_back("");
892 fastaFileNames.resize(barcodeNameVector.size());
893 for(int i=0;i<fastaFileNames.size();i++){
894 fastaFileNames[i].assign(primerNameVector.size(), "");
896 if(qFileName != ""){ qualFileNames = fastaFileNames; }
899 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
900 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
902 string primerName = primerNameVector[itPrimer->second];
903 string barcodeName = barcodeNameVector[itBar->second];
905 string comboGroupName = "";
906 string fastaFileName = "";
907 string qualFileName = "";
909 if(primerName == ""){
910 comboGroupName = barcodeNameVector[itBar->second];
913 if(barcodeName == ""){
914 comboGroupName = primerNameVector[itPrimer->second];
917 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
922 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
923 outputNames.push_back(fastaFileName);
924 outputTypes["fasta"].push_back(fastaFileName);
925 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
926 m->openOutputFile(fastaFileName, temp); temp.close();
929 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
930 outputNames.push_back(qualFileName);
931 outputTypes["qual"].push_back(qualFileName);
932 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
933 m->openOutputFile(qualFileName, temp); temp.close();
938 numFPrimers = primers.size();
939 numRPrimers = revPrimer.size();
942 catch(exception& e) {
943 m->errorOut(e, "TrimSeqsCommand", "getOligos");
948 //***************************************************************************************************************
950 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
953 string rawSequence = seq.getUnaligned();
954 int success = bdiffs + 1; //guilty until proven innocent
956 //can you find the barcode
957 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
958 string oligo = it->first;
959 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
960 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
964 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
966 seq.setUnaligned(rawSequence.substr(oligo.length()));
968 if(qual.getName() != ""){
969 qual.trimQScores(oligo.length(), -1);
977 //if you found the barcode or if you don't want to allow for diffs
978 if ((bdiffs == 0) || (success == 0)) { return success; }
980 else { //try aligning and see if you can find it
984 Alignment* alignment;
985 if (barcodes.size() > 0) {
986 map<string,int>::iterator it=barcodes.begin();
988 for(it;it!=barcodes.end();it++){
989 if(it->first.length() > maxLength){
990 maxLength = it->first.length();
993 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
995 }else{ alignment = NULL; }
997 //can you find the barcode
1003 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1004 string oligo = it->first;
1005 // int length = oligo.length();
1007 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
1008 success = bdiffs + 10;
1012 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1013 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1014 oligo = alignment->getSeqAAln();
1015 string temp = alignment->getSeqBAln();
1017 int alnLength = oligo.length();
1019 for(int i=oligo.length()-1;i>=0;i--){
1020 if(oligo[i] != '-'){ alnLength = i+1; break; }
1022 oligo = oligo.substr(0,alnLength);
1023 temp = temp.substr(0,alnLength);
1025 int numDiff = countDiffs(oligo, temp);
1027 if(numDiff < minDiff){
1030 minGroup = it->second;
1032 for(int i=0;i<alnLength;i++){
1038 else if(numDiff == minDiff){
1044 if(minDiff > bdiffs) { success = minDiff; } //no good matches
1045 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
1046 else{ //use the best match
1048 seq.setUnaligned(rawSequence.substr(minPos));
1050 if(qual.getName() != ""){
1051 qual.trimQScores(minPos, -1);
1056 if (alignment != NULL) { delete alignment; }
1063 catch(exception& e) {
1064 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1070 //***************************************************************************************************************
1072 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1074 string rawSequence = seq.getUnaligned();
1075 int success = pdiffs + 1; //guilty until proven innocent
1077 //can you find the primer
1078 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1079 string oligo = it->first;
1080 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
1081 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1085 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1087 seq.setUnaligned(rawSequence.substr(oligo.length()));
1088 if(qual.getName() != ""){
1089 qual.trimQScores(oligo.length(), -1);
1096 //if you found the barcode or if you don't want to allow for diffs
1097 if ((pdiffs == 0) || (success == 0)) { return success; }
1099 else { //try aligning and see if you can find it
1103 Alignment* alignment;
1104 if (primers.size() > 0) {
1105 map<string,int>::iterator it=primers.begin();
1107 for(it;it!=primers.end();it++){
1108 if(it->first.length() > maxLength){
1109 maxLength = it->first.length();
1112 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
1114 }else{ alignment = NULL; }
1116 //can you find the barcode
1122 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1123 string oligo = it->first;
1124 // int length = oligo.length();
1126 if(rawSequence.length() < maxLength){
1127 success = pdiffs + 100;
1131 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1132 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1133 oligo = alignment->getSeqAAln();
1134 string temp = alignment->getSeqBAln();
1136 int alnLength = oligo.length();
1138 for(int i=oligo.length()-1;i>=0;i--){
1139 if(oligo[i] != '-'){ alnLength = i+1; break; }
1141 oligo = oligo.substr(0,alnLength);
1142 temp = temp.substr(0,alnLength);
1144 int numDiff = countDiffs(oligo, temp);
1146 if(numDiff < minDiff){
1149 minGroup = it->second;
1151 for(int i=0;i<alnLength;i++){
1157 else if(numDiff == minDiff){
1163 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1164 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
1165 else{ //use the best match
1167 seq.setUnaligned(rawSequence.substr(minPos));
1168 if(qual.getName() != ""){
1169 qual.trimQScores(minPos, -1);
1174 if (alignment != NULL) { delete alignment; }
1181 catch(exception& e) {
1182 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1187 //***************************************************************************************************************
1189 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1191 string rawSequence = seq.getUnaligned();
1192 bool success = 0; //guilty until proven innocent
1194 for(int i=0;i<numRPrimers;i++){
1195 string oligo = revPrimer[i];
1197 if(rawSequence.length() < oligo.length()){
1202 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1203 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1204 if(qual.getName() != ""){
1205 qual.trimQScores(-1, rawSequence.length()-oligo.length());
1214 catch(exception& e) {
1215 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1220 //***************************************************************************************************************
1222 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1225 if(qscores.getName() != ""){
1226 qscores.trimQScores(-1, keepFirst);
1228 sequence.trim(keepFirst);
1231 catch(exception& e) {
1232 m->errorOut(e, "keepFirstTrim", "countDiffs");
1238 //***************************************************************************************************************
1240 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1244 int length = sequence.getNumBases() - removeLast;
1247 if(qscores.getName() != ""){
1248 qscores.trimQScores(-1, length);
1250 sequence.trim(length);
1259 catch(exception& e) {
1260 m->errorOut(e, "removeLastTrim", "countDiffs");
1266 //***************************************************************************************************************
1268 bool TrimSeqsCommand::cullLength(Sequence& seq){
1271 int length = seq.getNumBases();
1272 bool success = 0; //guilty until proven innocent
1274 if(length >= minLength && maxLength == 0) { success = 1; }
1275 else if(length >= minLength && length <= maxLength) { success = 1; }
1276 else { success = 0; }
1281 catch(exception& e) {
1282 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1288 //***************************************************************************************************************
1290 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1292 int longHomoP = seq.getLongHomoPolymer();
1293 bool success = 0; //guilty until proven innocent
1295 if(longHomoP <= maxHomoP){ success = 1; }
1296 else { success = 0; }
1300 catch(exception& e) {
1301 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1307 //***************************************************************************************************************
1309 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1311 int numNs = seq.getAmbigBases();
1312 bool success = 0; //guilty until proven innocent
1314 if(numNs <= maxAmbig) { success = 1; }
1315 else { success = 0; }
1319 catch(exception& e) {
1320 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1326 //***************************************************************************************************************
1328 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1331 int length = oligo.length();
1333 for(int i=0;i<length;i++){
1335 if(oligo[i] != seq[i]){
1336 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
1337 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
1338 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
1339 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
1340 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
1341 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1342 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
1343 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1344 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1345 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1346 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
1347 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1349 if(success == 0) { break; }
1358 catch(exception& e) {
1359 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1365 //***************************************************************************************************************
1367 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1370 int length = oligo.length();
1373 for(int i=0;i<length;i++){
1375 if(oligo[i] != seq[i]){
1376 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
1377 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
1378 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
1379 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
1380 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
1381 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1382 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
1383 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1384 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1385 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1386 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
1387 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1394 catch(exception& e) {
1395 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1401 //***************************************************************************************************************