5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
13 //**********************************************************************************************************************
15 vector<string> TrimSeqsCommand::getValidParameters(){
17 string Array[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop","minlength", "maxlength", "qfile",
18 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
19 "keepfirst", "removelast",
20 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
21 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25 m->errorOut(e, "TrimSeqsCommand", "getValidParameters");
30 //**********************************************************************************************************************
32 TrimSeqsCommand::TrimSeqsCommand(){
34 abort = true; calledHelp = true;
35 vector<string> tempOutNames;
36 outputTypes["fasta"] = tempOutNames;
37 outputTypes["qfile"] = tempOutNames;
38 outputTypes["group"] = tempOutNames;
41 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
46 //**********************************************************************************************************************
48 vector<string> TrimSeqsCommand::getRequiredParameters(){
50 string Array[] = {"fasta"};
51 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
55 m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters");
60 //**********************************************************************************************************************
62 vector<string> TrimSeqsCommand::getRequiredFiles(){
64 vector<string> myArray;
68 m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles");
73 //***************************************************************************************************************
75 TrimSeqsCommand::TrimSeqsCommand(string option) {
78 abort = false; calledHelp = false;
81 //allow user to run help
82 if(option == "help") { help(); abort = true; calledHelp = true; }
85 //valid paramters for this command
86 string AlignArray[] = { "fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile",
87 "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
88 "keepfirst", "removelast",
89 "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
91 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
93 OptionParser parser(option);
94 map<string,string> parameters = parser.getParameters();
96 ValidParameters validParameter;
97 map<string,string>::iterator it;
99 //check to make sure all parameters are valid for command
100 for (it = parameters.begin(); it != parameters.end(); it++) {
101 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
104 //initialize outputTypes
105 vector<string> tempOutNames;
106 outputTypes["fasta"] = tempOutNames;
107 outputTypes["qfile"] = tempOutNames;
108 outputTypes["group"] = tempOutNames;
110 //if the user changes the input directory command factory will send this info to us in the output parameter
111 string inputDir = validParameter.validFile(parameters, "inputdir", false);
112 if (inputDir == "not found"){ inputDir = ""; }
115 it = parameters.find("fasta");
116 //user has given a template file
117 if(it != parameters.end()){
118 path = m->hasPath(it->second);
119 //if the user has not given a path then, add inputdir. else leave path alone.
120 if (path == "") { parameters["fasta"] = inputDir + it->second; }
123 it = parameters.find("oligos");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["oligos"] = inputDir + it->second; }
131 it = parameters.find("qfile");
132 //user has given a template file
133 if(it != parameters.end()){
134 path = m->hasPath(it->second);
135 //if the user has not given a path then, add inputdir. else leave path alone.
136 if (path == "") { parameters["qfile"] = inputDir + it->second; }
142 //check for required parameters
143 fastaFile = validParameter.validFile(parameters, "fasta", true);
144 if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; }
145 else if (fastaFile == "not open") { abort = true; }
147 //if the user changes the output directory command factory will send this info to us in the output parameter
148 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
150 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
154 //check for optional parameter and set defaults
155 // ...at some point should added some additional type checking...
157 temp = validParameter.validFile(parameters, "flip", false);
158 if (temp == "not found"){ flip = 0; }
159 else if(m->isTrue(temp)) { flip = 1; }
161 temp = validParameter.validFile(parameters, "oligos", true);
162 if (temp == "not found"){ oligoFile = ""; }
163 else if(temp == "not open"){ abort = true; }
164 else { oligoFile = temp; }
167 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
168 convert(temp, maxAmbig);
170 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
171 convert(temp, maxHomoP);
173 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
174 convert(temp, minLength);
176 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
177 convert(temp, maxLength);
179 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
180 convert(temp, bdiffs);
182 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
183 convert(temp, pdiffs);
185 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
186 convert(temp, tdiffs);
188 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
190 temp = validParameter.validFile(parameters, "qfile", true);
191 if (temp == "not found") { qFileName = ""; }
192 else if(temp == "not open") { abort = true; }
193 else { qFileName = temp; }
195 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
196 convert(temp, qThreshold);
198 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
199 qtrim = m->isTrue(temp);
201 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
202 convert(temp, qRollAverage);
204 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
205 convert(temp, qWindowAverage);
207 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
208 convert(temp, qWindowSize);
210 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
211 convert(temp, qWindowStep);
213 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
214 convert(temp, qAverage);
216 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
217 convert(temp, keepFirst);
219 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
220 convert(temp, removeLast);
222 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
223 allFiles = m->isTrue(temp);
225 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
226 convert(temp, processors);
229 if(allFiles && (oligoFile == "")){
230 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
232 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
233 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
237 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
238 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
244 catch(exception& e) {
245 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
250 //**********************************************************************************************************************
252 void TrimSeqsCommand::help(){
254 m->mothurOut("The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n");
255 m->mothurOut("The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n");
256 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n");
257 m->mothurOut("The fasta parameter is required.\n");
258 m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
259 m->mothurOut("The oligos parameter allows you to provide an oligos file.\n");
260 m->mothurOut("The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n");
261 m->mothurOut("The maxhomop parameter allows you to set a maximum homopolymer length. \n");
262 m->mothurOut("The minlength parameter allows you to set and minimum sequence length. \n");
263 m->mothurOut("The maxlength parameter allows you to set and maximum sequence length. \n");
264 m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n");
265 m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n");
266 m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n");
267 m->mothurOut("The qfile parameter allows you to provide a quality file.\n");
268 m->mothurOut("The qthreshold parameter allows you to set a minimum quality score allowed. \n");
269 m->mothurOut("The qaverage parameter allows you to set a minimum average quality score allowed. \n");
270 m->mothurOut("The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n");
271 m->mothurOut("The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n");
272 m->mothurOut("The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n");
273 m->mothurOut("The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n");
274 m->mothurOut("The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n");
275 m->mothurOut("The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n");
276 m->mothurOut("The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n");
277 m->mothurOut("The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n");
278 m->mothurOut("The trim.seqs command should be in the following format: \n");
279 m->mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n");
280 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n");
281 m->mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
282 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n");
283 m->mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n");
286 catch(exception& e) {
287 m->errorOut(e, "TrimSeqsCommand", "help");
293 //***************************************************************************************************************
295 TrimSeqsCommand::~TrimSeqsCommand(){ /* do nothing */ }
297 //***************************************************************************************************************
299 int TrimSeqsCommand::execute(){
302 if (abort == true) { if (calledHelp) { return 0; } return 2; }
304 numFPrimers = 0; //this needs to be initialized
306 vector<vector<string> > fastaFileNames;
307 vector<vector<string> > qualFileNames;
309 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
310 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
312 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
313 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
315 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
316 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
317 if (qFileName != "") {
318 outputNames.push_back(trimQualFile);
319 outputNames.push_back(scrapQualFile);
320 outputTypes["qfile"].push_back(trimQualFile);
321 outputTypes["qfile"].push_back(scrapQualFile);
324 string outputGroupFileName;
326 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
327 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
328 getOligos(fastaFileNames, qualFileNames);
331 vector<unsigned long int> fastaFilePos;
332 vector<unsigned long int> qFilePos;
334 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
336 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
337 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
338 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
340 if(qFileName == "") { qLines = lines; } //files with duds
342 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
344 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
346 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames);
349 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
352 if (m->control_pressed) { return 0; }
355 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
356 map<string, string>::iterator it;
357 set<string> namesToRemove;
358 for(int i=0;i<fastaFileNames.size();i++){
359 for(int j=0;j<fastaFileNames[0].size();j++){
360 if (fastaFileNames[i][j] != "") {
361 if(m->isBlank(fastaFileNames[i][j])){
362 remove(fastaFileNames[i][j].c_str());
363 namesToRemove.insert(fastaFileNames[i][j]);
366 remove(qualFileNames[i][j].c_str());
367 namesToRemove.insert(qualFileNames[i][j]);
370 it = uniqueFastaNames.find(fastaFileNames[i][j]);
371 if (it == uniqueFastaNames.end()) {
372 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
379 //remove names for outputFileNames, just cleans up the output
380 vector<string> outputNames2;
381 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
382 outputNames = outputNames2;
384 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
386 m->openInputFile(it->first, in);
389 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
390 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
391 m->openOutputFile(thisGroupName, out);
394 if (m->control_pressed) { break; }
396 Sequence currSeq(in); m->gobble(in);
397 out << currSeq.getName() << '\t' << it->second << endl;
404 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
406 //output group counts
407 m->mothurOutEndLine();
409 // for (int i = 0; i < barcodeNameVector.size(); i++) {
410 // if ((barcodeNameVector[i] != "") && (groupCounts[i] != 0)) { total += groupCounts[i]; m->mothurOut("Group " + barcodeNameVector[i] + " contains " + toString(groupCounts[i]) + " sequences."); m->mothurOutEndLine(); }
412 // if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
414 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
416 //set fasta file as new current fastafile
418 itTypes = outputTypes.find("fasta");
419 if (itTypes != outputTypes.end()) {
420 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
423 itTypes = outputTypes.find("qfile");
424 if (itTypes != outputTypes.end()) {
425 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
428 itTypes = outputTypes.find("group");
429 if (itTypes != outputTypes.end()) {
430 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
433 m->mothurOutEndLine();
434 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
435 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
436 m->mothurOutEndLine();
441 catch(exception& e) {
442 m->errorOut(e, "TrimSeqsCommand", "execute");
447 /**************************************************************************************/
449 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, linePair* line, linePair* qline) {
453 ofstream trimFASTAFile;
454 m->openOutputFile(trimFileName, trimFASTAFile);
456 ofstream scrapFASTAFile;
457 m->openOutputFile(scrapFileName, scrapFASTAFile);
459 ofstream trimQualFile;
460 ofstream scrapQualFile;
462 m->openOutputFile(trimQFileName, trimQualFile);
463 m->openOutputFile(scrapQFileName, scrapQualFile);
466 ofstream outGroupsFile;
467 if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); }
469 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
470 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
471 if (fastaFileNames[i][j] != "") {
473 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
475 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
483 m->openInputFile(filename, inFASTA);
484 inFASTA.seekg(line->start);
487 if(qFileName != "") {
488 m->openInputFile(qFileName, qFile);
489 qFile.seekg(qline->start);
497 if (m->control_pressed) {
498 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
499 if (oligoFile != "") { outGroupsFile.close(); }
504 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
510 string trashCode = "";
511 int currentSeqsDiffs = 0;
513 Sequence currSeq(inFASTA); m->gobble(inFASTA);
515 QualityScores currQual;
517 currQual = QualityScores(qFile); m->gobble(qFile);
520 string origSeq = currSeq.getUnaligned();
523 int barcodeIndex = 0;
526 if(barcodes.size() != 0){
527 success = stripBarcode(currSeq, currQual, barcodeIndex);
528 if(success > bdiffs) { trashCode += 'b'; }
529 else{ currentSeqsDiffs += success; }
532 if(numFPrimers != 0){
533 success = stripForward(currSeq, currQual, primerIndex);
534 if(success > pdiffs) { trashCode += 'f'; }
535 else{ currentSeqsDiffs += success; }
538 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
540 if(numRPrimers != 0){
541 success = stripReverse(currSeq, currQual);
542 if(!success) { trashCode += 'r'; }
546 success = keepFirstTrim(currSeq, currQual);
550 success = removeLastTrim(currSeq, currQual);
551 if(!success) { trashCode += 'l'; }
556 int origLength = currSeq.getNumBases();
558 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
559 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
560 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
561 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
562 else { success = 1; }
564 //you don't want to trim, if it fails above then scrap it
565 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
567 if(!success) { trashCode += 'q'; }
570 if(minLength > 0 || maxLength > 0){
571 success = cullLength(currSeq);
572 if(!success) { trashCode += 'l'; }
575 success = cullHomoP(currSeq);
576 if(!success) { trashCode += 'h'; }
579 success = cullAmbigs(currSeq);
580 if(!success) { trashCode += 'n'; }
583 if(flip){ // should go last
584 currSeq.reverseComplement();
586 currQual.flipQScores();
590 if(trashCode.length() == 0){
591 currSeq.setAligned(currSeq.getUnaligned());
592 currSeq.printSequence(trimFASTAFile);
595 currQual.printQScores(trimQualFile);
598 if(barcodes.size() != 0){
599 outGroupsFile << currSeq.getName() << '\t' << barcodeNameVector[barcodeIndex] << endl;
600 groupCounts[barcodeIndex]++;
606 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
607 currSeq.printSequence(output);
611 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
612 currQual.printQScores(output);
618 currSeq.setName(currSeq.getName() + '|' + trashCode);
619 currSeq.setUnaligned(origSeq);
620 currSeq.setAligned(origSeq);
621 currSeq.printSequence(scrapFASTAFile);
623 currQual.printQScores(scrapQualFile);
629 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
630 unsigned long int pos = inFASTA.tellg();
631 if ((pos == -1) || (pos >= line->end)) { break; }
633 if (inFASTA.eof()) { break; }
637 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
641 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
645 trimFASTAFile.close();
646 scrapFASTAFile.close();
647 if (oligoFile != "") { outGroupsFile.close(); }
648 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
652 catch(exception& e) {
653 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
658 /**************************************************************************************************/
660 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames) {
662 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
667 //loop through and create all the processes you want
668 while (process != processors) {
672 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
676 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
677 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
682 for(int i=0;i<tempFASTAFileNames.size();i++){
683 for(int j=0;j<tempFASTAFileNames[i].size();j++){
684 if (tempFASTAFileNames[i][j] != "") {
685 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
686 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
689 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
690 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
697 driverCreateTrim(filename,
699 (trimFASTAFileName + toString(getpid()) + ".temp"),
700 (scrapFASTAFileName + toString(getpid()) + ".temp"),
701 (trimQualFileName + toString(getpid()) + ".temp"),
702 (scrapQualFileName + toString(getpid()) + ".temp"),
703 (groupFile + toString(getpid()) + ".temp"),
705 tempPrimerQualFileNames,
709 //pass groupCounts to parent
711 string tempFile = filename + toString(getpid()) + ".num.temp";
712 m->openOutputFile(tempFile, out);
713 for(int i = 0; i < groupCounts.size(); i++) {
714 out << groupCounts[i] << endl;
720 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
721 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
728 m->openOutputFile(trimFASTAFileName, temp); temp.close();
729 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
730 m->openOutputFile(trimQualFileName, temp); temp.close();
731 m->openOutputFile(scrapQualFileName, temp); temp.close();
733 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
735 //force parent to wait until all the processes are done
736 for (int i=0;i<processIDS.size();i++) {
737 int temp = processIDS[i];
742 for(int i=0;i<processIDS.size();i++){
744 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
746 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
747 remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
748 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
749 remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
752 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
753 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
754 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
755 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
758 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
759 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
763 for(int j=0;j<fastaFileNames.size();j++){
764 for(int k=0;k<fastaFileNames[j].size();k++){
765 if (fastaFileNames[j][k] != "") {
766 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
767 remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
770 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
771 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
779 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
780 m->openInputFile(tempFile, in);
784 in >> tempNum; m->gobble(in);
785 groupCounts[count] += tempNum;
788 in.close(); remove(tempFile.c_str());
795 catch(exception& e) {
796 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
801 /**************************************************************************************************/
803 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
806 //set file positions for fasta file
807 fastaFilePos = m->divideFile(filename, processors);
809 if (qfilename == "") { return processors; }
811 //get name of first sequence in each chunk
812 map<string, int> firstSeqNames;
813 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
815 m->openInputFile(filename, in);
816 in.seekg(fastaFilePos[i]);
819 firstSeqNames[temp.getName()] = i;
824 //seach for filePos of each first name in the qfile and save in qfileFilePos
826 m->openInputFile(qfilename, inQual);
829 while(!inQual.eof()){
830 input = m->getline(inQual);
832 if (input.length() != 0) {
833 if(input[0] == '>'){ //this is a sequence name line
834 istringstream nameStream(input);
836 string sname = ""; nameStream >> sname;
837 sname = sname.substr(1);
839 map<string, int>::iterator it = firstSeqNames.find(sname);
841 if(it != firstSeqNames.end()) { //this is the start of a new chunk
842 unsigned long int pos = inQual.tellg();
843 qfileFilePos.push_back(pos - input.length() - 1);
844 firstSeqNames.erase(it);
849 if (firstSeqNames.size() == 0) { break; }
854 if (firstSeqNames.size() != 0) {
855 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
856 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
862 //get last file position of qfile
864 unsigned long int size;
866 //get num bytes in file
867 pFile = fopen (qfilename.c_str(),"rb");
868 if (pFile==NULL) perror ("Error opening file");
870 fseek (pFile, 0, SEEK_END);
875 qfileFilePos.push_back(size);
879 catch(exception& e) {
880 m->errorOut(e, "TrimSeqsCommand", "setLines");
885 //***************************************************************************************************************
887 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
890 m->openInputFile(oligoFile, inOligos);
894 string type, oligo, group;
897 int indexBarcode = 0;
899 while(!inOligos.eof()){
901 inOligos >> type; m->gobble(inOligos);
904 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
907 //make type case insensitive
908 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
912 for(int i=0;i<oligo.length();i++){
913 oligo[i] = toupper(oligo[i]);
914 if(oligo[i] == 'U') { oligo[i] = 'T'; }
917 if(type == "FORWARD"){
920 // get rest of line in case there is a primer name
921 while (!inOligos.eof()) {
922 char c = inOligos.get();
923 if (c == 10 || c == 13){ break; }
924 else if (c == 32 || c == 9){;} //space or tab
928 //check for repeat barcodes
929 map<string, int>::iterator itPrime = primers.find(oligo);
930 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
932 primers[oligo]=indexPrimer; indexPrimer++;
933 primerNameVector.push_back(group);
935 else if(type == "REVERSE"){
936 Sequence oligoRC("reverse", oligo);
937 oligoRC.reverseComplement();
938 revPrimer.push_back(oligoRC.getUnaligned());
940 else if(type == "BARCODE"){
943 //check for repeat barcodes
944 map<string, int>::iterator itBar = barcodes.find(oligo);
945 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
947 barcodes[oligo]=indexBarcode; indexBarcode++;
948 barcodeNameVector.push_back(group);
950 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
956 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
958 //add in potential combos
959 if(barcodeNameVector.size() == 0){
961 barcodeNameVector.push_back("");
964 if(primerNameVector.size() == 0){
966 primerNameVector.push_back("");
969 fastaFileNames.resize(barcodeNameVector.size());
970 for(int i=0;i<fastaFileNames.size();i++){
971 fastaFileNames[i].assign(primerNameVector.size(), "");
973 if(qFileName != ""){ qualFileNames = fastaFileNames; }
976 set<string> uniqueNames; //used to cleanup outputFileNames
977 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
978 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
980 string primerName = primerNameVector[itPrimer->second];
981 string barcodeName = barcodeNameVector[itBar->second];
983 string comboGroupName = "";
984 string fastaFileName = "";
985 string qualFileName = "";
987 if(primerName == ""){
988 comboGroupName = barcodeNameVector[itBar->second];
991 if(barcodeName == ""){
992 comboGroupName = primerNameVector[itPrimer->second];
995 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1000 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1001 if (uniqueNames.count(fastaFileName) == 0) {
1002 outputNames.push_back(fastaFileName);
1003 outputTypes["fasta"].push_back(fastaFileName);
1004 uniqueNames.insert(fastaFileName);
1007 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1008 m->openOutputFile(fastaFileName, temp); temp.close();
1010 if(qFileName != ""){
1011 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1012 if (uniqueNames.count(fastaFileName) == 0) {
1013 outputNames.push_back(qualFileName);
1014 outputTypes["qfile"].push_back(qualFileName);
1017 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1018 m->openOutputFile(qualFileName, temp); temp.close();
1023 numFPrimers = primers.size();
1024 numRPrimers = revPrimer.size();
1025 groupCounts.resize(barcodeNameVector.size(), 0);
1028 catch(exception& e) {
1029 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1034 //***************************************************************************************************************
1036 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1039 string rawSequence = seq.getUnaligned();
1040 int success = bdiffs + 1; //guilty until proven innocent
1042 //can you find the barcode
1043 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1044 string oligo = it->first;
1045 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
1046 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
1050 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1052 seq.setUnaligned(rawSequence.substr(oligo.length()));
1054 if(qual.getName() != ""){
1055 qual.trimQScores(oligo.length(), -1);
1063 //if you found the barcode or if you don't want to allow for diffs
1064 if ((bdiffs == 0) || (success == 0)) { return success; }
1066 else { //try aligning and see if you can find it
1070 Alignment* alignment;
1071 if (barcodes.size() > 0) {
1072 map<string,int>::iterator it=barcodes.begin();
1074 for(it;it!=barcodes.end();it++){
1075 if(it->first.length() > maxLength){
1076 maxLength = it->first.length();
1079 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
1081 }else{ alignment = NULL; }
1083 //can you find the barcode
1089 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1090 string oligo = it->first;
1091 // int length = oligo.length();
1093 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
1094 success = bdiffs + 10;
1098 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1099 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1100 oligo = alignment->getSeqAAln();
1101 string temp = alignment->getSeqBAln();
1103 int alnLength = oligo.length();
1105 for(int i=oligo.length()-1;i>=0;i--){
1106 if(oligo[i] != '-'){ alnLength = i+1; break; }
1108 oligo = oligo.substr(0,alnLength);
1109 temp = temp.substr(0,alnLength);
1111 int numDiff = countDiffs(oligo, temp);
1113 if(numDiff < minDiff){
1116 minGroup = it->second;
1118 for(int i=0;i<alnLength;i++){
1124 else if(numDiff == minDiff){
1130 if(minDiff > bdiffs) { success = minDiff; } //no good matches
1131 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
1132 else{ //use the best match
1134 seq.setUnaligned(rawSequence.substr(minPos));
1136 if(qual.getName() != ""){
1137 qual.trimQScores(minPos, -1);
1142 if (alignment != NULL) { delete alignment; }
1149 catch(exception& e) {
1150 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1156 //***************************************************************************************************************
1158 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1160 string rawSequence = seq.getUnaligned();
1161 int success = pdiffs + 1; //guilty until proven innocent
1163 //can you find the primer
1164 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1165 string oligo = it->first;
1166 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
1167 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1171 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1173 seq.setUnaligned(rawSequence.substr(oligo.length()));
1174 if(qual.getName() != ""){
1175 qual.trimQScores(oligo.length(), -1);
1182 //if you found the barcode or if you don't want to allow for diffs
1183 if ((pdiffs == 0) || (success == 0)) { return success; }
1185 else { //try aligning and see if you can find it
1189 Alignment* alignment;
1190 if (primers.size() > 0) {
1191 map<string,int>::iterator it=primers.begin();
1193 for(it;it!=primers.end();it++){
1194 if(it->first.length() > maxLength){
1195 maxLength = it->first.length();
1198 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
1200 }else{ alignment = NULL; }
1202 //can you find the barcode
1208 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1209 string oligo = it->first;
1210 // int length = oligo.length();
1212 if(rawSequence.length() < maxLength){
1213 success = pdiffs + 100;
1217 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1218 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1219 oligo = alignment->getSeqAAln();
1220 string temp = alignment->getSeqBAln();
1222 int alnLength = oligo.length();
1224 for(int i=oligo.length()-1;i>=0;i--){
1225 if(oligo[i] != '-'){ alnLength = i+1; break; }
1227 oligo = oligo.substr(0,alnLength);
1228 temp = temp.substr(0,alnLength);
1230 int numDiff = countDiffs(oligo, temp);
1232 if(numDiff < minDiff){
1235 minGroup = it->second;
1237 for(int i=0;i<alnLength;i++){
1243 else if(numDiff == minDiff){
1249 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1250 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
1251 else{ //use the best match
1253 seq.setUnaligned(rawSequence.substr(minPos));
1254 if(qual.getName() != ""){
1255 qual.trimQScores(minPos, -1);
1260 if (alignment != NULL) { delete alignment; }
1267 catch(exception& e) {
1268 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1273 //***************************************************************************************************************
1275 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1277 string rawSequence = seq.getUnaligned();
1278 bool success = 0; //guilty until proven innocent
1280 for(int i=0;i<numRPrimers;i++){
1281 string oligo = revPrimer[i];
1283 if(rawSequence.length() < oligo.length()){
1288 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1289 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1290 if(qual.getName() != ""){
1291 qual.trimQScores(-1, rawSequence.length()-oligo.length());
1300 catch(exception& e) {
1301 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1306 //***************************************************************************************************************
1308 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1311 if(qscores.getName() != ""){
1312 qscores.trimQScores(-1, keepFirst);
1314 sequence.trim(keepFirst);
1317 catch(exception& e) {
1318 m->errorOut(e, "keepFirstTrim", "countDiffs");
1324 //***************************************************************************************************************
1326 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1330 int length = sequence.getNumBases() - removeLast;
1333 if(qscores.getName() != ""){
1334 qscores.trimQScores(-1, length);
1336 sequence.trim(length);
1345 catch(exception& e) {
1346 m->errorOut(e, "removeLastTrim", "countDiffs");
1352 //***************************************************************************************************************
1354 bool TrimSeqsCommand::cullLength(Sequence& seq){
1357 int length = seq.getNumBases();
1358 bool success = 0; //guilty until proven innocent
1360 if(length >= minLength && maxLength == 0) { success = 1; }
1361 else if(length >= minLength && length <= maxLength) { success = 1; }
1362 else { success = 0; }
1367 catch(exception& e) {
1368 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1374 //***************************************************************************************************************
1376 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1378 int longHomoP = seq.getLongHomoPolymer();
1379 bool success = 0; //guilty until proven innocent
1381 if(longHomoP <= maxHomoP){ success = 1; }
1382 else { success = 0; }
1386 catch(exception& e) {
1387 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1393 //***************************************************************************************************************
1395 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1397 int numNs = seq.getAmbigBases();
1398 bool success = 0; //guilty until proven innocent
1400 if(numNs <= maxAmbig) { success = 1; }
1401 else { success = 0; }
1405 catch(exception& e) {
1406 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1412 //***************************************************************************************************************
1414 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1417 int length = oligo.length();
1419 for(int i=0;i<length;i++){
1421 if(oligo[i] != seq[i]){
1422 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
1423 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
1424 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
1425 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
1426 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
1427 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1428 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
1429 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1430 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1431 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1432 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
1433 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1435 if(success == 0) { break; }
1444 catch(exception& e) {
1445 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1451 //***************************************************************************************************************
1453 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1456 int length = oligo.length();
1459 for(int i=0;i<length;i++){
1461 if(oligo[i] != seq[i]){
1462 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
1463 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
1464 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
1465 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
1466 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
1467 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1468 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
1469 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1470 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1471 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1472 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
1473 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1480 catch(exception& e) {
1481 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1487 //***************************************************************************************************************