5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
13 //**********************************************************************************************************************
14 vector<string> TrimSeqsCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
19 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
20 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
21 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
22 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
23 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
24 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
25 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
26 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
27 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
28 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
29 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
30 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
31 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
32 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
33 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
34 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
35 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
36 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
37 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
38 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
39 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
40 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
42 vector<string> myArray;
43 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
47 m->errorOut(e, "TrimSeqsCommand", "setParameters");
51 //**********************************************************************************************************************
52 string TrimSeqsCommand::getHelpString(){
54 string helpString = "";
55 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
56 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
57 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
58 helpString += "The fasta parameter is required.\n";
59 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
60 helpString += "The oligos parameter allows you to provide an oligos file.\n";
61 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
62 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
63 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
64 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
65 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
66 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
67 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
68 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
69 helpString += "The qfile parameter allows you to provide a quality file.\n";
70 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
71 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
72 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
73 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
74 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
75 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
76 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
77 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
78 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
79 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
80 helpString += "The trim.seqs command should be in the following format: \n";
81 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
82 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
83 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
84 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
85 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
89 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
95 //**********************************************************************************************************************
97 TrimSeqsCommand::TrimSeqsCommand(){
99 abort = true; calledHelp = true;
101 vector<string> tempOutNames;
102 outputTypes["fasta"] = tempOutNames;
103 outputTypes["qfile"] = tempOutNames;
104 outputTypes["group"] = tempOutNames;
105 outputTypes["name"] = tempOutNames;
107 catch(exception& e) {
108 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
112 //***************************************************************************************************************
114 TrimSeqsCommand::TrimSeqsCommand(string option) {
117 abort = false; calledHelp = false;
120 //allow user to run help
121 if(option == "help") { help(); abort = true; calledHelp = true; }
122 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
125 vector<string> myArray = setParameters();
127 OptionParser parser(option);
128 map<string,string> parameters = parser.getParameters();
130 ValidParameters validParameter;
131 map<string,string>::iterator it;
133 //check to make sure all parameters are valid for command
134 for (it = parameters.begin(); it != parameters.end(); it++) {
135 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
138 //initialize outputTypes
139 vector<string> tempOutNames;
140 outputTypes["fasta"] = tempOutNames;
141 outputTypes["qfile"] = tempOutNames;
142 outputTypes["group"] = tempOutNames;
143 outputTypes["name"] = tempOutNames;
145 //if the user changes the input directory command factory will send this info to us in the output parameter
146 string inputDir = validParameter.validFile(parameters, "inputdir", false);
147 if (inputDir == "not found"){ inputDir = ""; }
150 it = parameters.find("fasta");
151 //user has given a template file
152 if(it != parameters.end()){
153 path = m->hasPath(it->second);
154 //if the user has not given a path then, add inputdir. else leave path alone.
155 if (path == "") { parameters["fasta"] = inputDir + it->second; }
158 it = parameters.find("oligos");
159 //user has given a template file
160 if(it != parameters.end()){
161 path = m->hasPath(it->second);
162 //if the user has not given a path then, add inputdir. else leave path alone.
163 if (path == "") { parameters["oligos"] = inputDir + it->second; }
166 it = parameters.find("qfile");
167 //user has given a template file
168 if(it != parameters.end()){
169 path = m->hasPath(it->second);
170 //if the user has not given a path then, add inputdir. else leave path alone.
171 if (path == "") { parameters["qfile"] = inputDir + it->second; }
174 it = parameters.find("name");
175 //user has given a template file
176 if(it != parameters.end()){
177 path = m->hasPath(it->second);
178 //if the user has not given a path then, add inputdir. else leave path alone.
179 if (path == "") { parameters["name"] = inputDir + it->second; }
185 //check for required parameters
186 fastaFile = validParameter.validFile(parameters, "fasta", true);
187 if (fastaFile == "not found") {
188 fastaFile = m->getFastaFile();
189 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
190 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
191 }else if (fastaFile == "not open") { abort = true; }
192 else { m->setFastaFile(fastaFile); }
194 //if the user changes the output directory command factory will send this info to us in the output parameter
195 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
197 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
201 //check for optional parameter and set defaults
202 // ...at some point should added some additional type checking...
204 temp = validParameter.validFile(parameters, "flip", false);
205 if (temp == "not found"){ flip = 0; }
206 else if(m->isTrue(temp)) { flip = 1; }
208 temp = validParameter.validFile(parameters, "oligos", true);
209 if (temp == "not found"){ oligoFile = ""; }
210 else if(temp == "not open"){ abort = true; }
211 else { oligoFile = temp; m->setOligosFile(oligoFile); }
214 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
215 convert(temp, maxAmbig);
217 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
218 convert(temp, maxHomoP);
220 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
221 convert(temp, minLength);
223 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
224 convert(temp, maxLength);
226 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
227 convert(temp, bdiffs);
229 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
230 convert(temp, pdiffs);
232 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
233 convert(temp, tdiffs);
235 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
237 temp = validParameter.validFile(parameters, "qfile", true);
238 if (temp == "not found") { qFileName = ""; }
239 else if(temp == "not open") { abort = true; }
240 else { qFileName = temp; m->setQualFile(qFileName); }
242 temp = validParameter.validFile(parameters, "name", true);
243 if (temp == "not found") { nameFile = ""; }
244 else if(temp == "not open") { nameFile = ""; abort = true; }
245 else { nameFile = temp; m->setNameFile(nameFile); }
247 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
248 convert(temp, qThreshold);
250 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
251 qtrim = m->isTrue(temp);
253 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
254 convert(temp, qRollAverage);
256 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
257 convert(temp, qWindowAverage);
259 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
260 convert(temp, qWindowSize);
262 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
263 convert(temp, qWindowStep);
265 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
266 convert(temp, qAverage);
268 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
269 convert(temp, keepFirst);
271 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
272 convert(temp, removeLast);
274 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
275 allFiles = m->isTrue(temp);
277 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
278 m->setProcessors(temp);
279 convert(temp, processors);
282 if(allFiles && (oligoFile == "")){
283 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
285 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
286 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
290 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
291 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
297 catch(exception& e) {
298 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
302 //***************************************************************************************************************
304 int TrimSeqsCommand::execute(){
307 if (abort == true) { if (calledHelp) { return 0; } return 2; }
309 numFPrimers = 0; //this needs to be initialized
312 vector<vector<string> > fastaFileNames;
313 vector<vector<string> > qualFileNames;
314 vector<vector<string> > nameFileNames;
316 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
317 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
319 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
320 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
322 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
323 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
325 if (qFileName != "") {
326 outputNames.push_back(trimQualFile);
327 outputNames.push_back(scrapQualFile);
328 outputTypes["qfile"].push_back(trimQualFile);
329 outputTypes["qfile"].push_back(scrapQualFile);
332 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
333 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
335 if (nameFile != "") {
336 m->readNames(nameFile, nameMap);
337 outputNames.push_back(trimNameFile);
338 outputNames.push_back(scrapNameFile);
339 outputTypes["name"].push_back(trimNameFile);
340 outputTypes["name"].push_back(scrapNameFile);
343 if (m->control_pressed) { return 0; }
345 string outputGroupFileName;
347 createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
349 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
350 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
354 vector<unsigned long int> fastaFilePos;
355 vector<unsigned long int> qFilePos;
357 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
359 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
360 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
361 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
363 if(qFileName == "") { qLines = lines; } //files with duds
365 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
367 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
369 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
372 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
375 if (m->control_pressed) { return 0; }
378 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
379 map<string, string>::iterator it;
380 set<string> namesToRemove;
381 for(int i=0;i<fastaFileNames.size();i++){
382 for(int j=0;j<fastaFileNames[0].size();j++){
383 if (fastaFileNames[i][j] != "") {
384 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
385 if(m->isBlank(fastaFileNames[i][j])){
386 m->mothurRemove(fastaFileNames[i][j]);
387 namesToRemove.insert(fastaFileNames[i][j]);
390 m->mothurRemove(qualFileNames[i][j]);
391 namesToRemove.insert(qualFileNames[i][j]);
395 m->mothurRemove(nameFileNames[i][j]);
396 namesToRemove.insert(nameFileNames[i][j]);
399 it = uniqueFastaNames.find(fastaFileNames[i][j]);
400 if (it == uniqueFastaNames.end()) {
401 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
409 //remove names for outputFileNames, just cleans up the output
410 vector<string> outputNames2;
411 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
412 outputNames = outputNames2;
414 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
416 m->openInputFile(it->first, in);
419 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
420 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
421 m->openOutputFile(thisGroupName, out);
424 if (m->control_pressed) { break; }
426 Sequence currSeq(in); m->gobble(in);
427 out << currSeq.getName() << '\t' << it->second << endl;
434 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
436 //output group counts
437 m->mothurOutEndLine();
439 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
440 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
441 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
443 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
445 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
447 //set fasta file as new current fastafile
449 itTypes = outputTypes.find("fasta");
450 if (itTypes != outputTypes.end()) {
451 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
454 itTypes = outputTypes.find("name");
455 if (itTypes != outputTypes.end()) {
456 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
459 itTypes = outputTypes.find("qfile");
460 if (itTypes != outputTypes.end()) {
461 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
464 itTypes = outputTypes.find("group");
465 if (itTypes != outputTypes.end()) {
466 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
469 m->mothurOutEndLine();
470 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
471 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
472 m->mothurOutEndLine();
477 catch(exception& e) {
478 m->errorOut(e, "TrimSeqsCommand", "execute");
483 /**************************************************************************************/
485 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair* line, linePair* qline) {
489 ofstream trimFASTAFile;
490 m->openOutputFile(trimFileName, trimFASTAFile);
492 ofstream scrapFASTAFile;
493 m->openOutputFile(scrapFileName, scrapFASTAFile);
495 ofstream trimQualFile;
496 ofstream scrapQualFile;
498 m->openOutputFile(trimQFileName, trimQualFile);
499 m->openOutputFile(scrapQFileName, scrapQualFile);
502 ofstream trimNameFile;
503 ofstream scrapNameFile;
505 m->openOutputFile(trimNFileName, trimNameFile);
506 m->openOutputFile(scrapNFileName, scrapNameFile);
510 ofstream outGroupsFile;
511 if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); }
513 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
514 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
515 if (fastaFileNames[i][j] != "") {
517 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
519 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
523 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
531 m->openInputFile(filename, inFASTA);
532 inFASTA.seekg(line->start);
535 if(qFileName != "") {
536 m->openInputFile(qFileName, qFile);
537 qFile.seekg(qline->start);
545 if (m->control_pressed) {
546 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
547 if (createGroup) { outGroupsFile.close(); }
552 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
558 string trashCode = "";
559 int currentSeqsDiffs = 0;
561 Sequence currSeq(inFASTA); m->gobble(inFASTA);
562 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
563 QualityScores currQual;
565 currQual = QualityScores(qFile); m->gobble(qFile);
568 string origSeq = currSeq.getUnaligned();
571 int barcodeIndex = 0;
574 if(barcodes.size() != 0){
575 success = stripBarcode(currSeq, currQual, barcodeIndex);
576 if(success > bdiffs) { trashCode += 'b'; }
577 else{ currentSeqsDiffs += success; }
580 if(numFPrimers != 0){
581 success = stripForward(currSeq, currQual, primerIndex);
582 if(success > pdiffs) { trashCode += 'f'; }
583 else{ currentSeqsDiffs += success; }
586 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
588 if(numRPrimers != 0){
589 success = stripReverse(currSeq, currQual);
590 if(!success) { trashCode += 'r'; }
594 success = keepFirstTrim(currSeq, currQual);
598 success = removeLastTrim(currSeq, currQual);
599 if(!success) { trashCode += 'l'; }
604 int origLength = currSeq.getNumBases();
606 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
607 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
608 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
609 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
610 else { success = 1; }
612 //you don't want to trim, if it fails above then scrap it
613 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
615 if(!success) { trashCode += 'q'; }
618 if(minLength > 0 || maxLength > 0){
619 success = cullLength(currSeq);
620 if(!success) { trashCode += 'l'; }
623 success = cullHomoP(currSeq);
624 if(!success) { trashCode += 'h'; }
627 success = cullAmbigs(currSeq);
628 if(!success) { trashCode += 'n'; }
631 if(flip){ // should go last
632 currSeq.reverseComplement();
634 currQual.flipQScores();
638 if(trashCode.length() == 0){
639 currSeq.setAligned(currSeq.getUnaligned());
640 currSeq.printSequence(trimFASTAFile);
643 currQual.printQScores(trimQualFile);
647 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
648 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
649 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
653 if(barcodes.size() != 0){
654 string thisGroup = barcodeNameVector[barcodeIndex];
655 if (primers.size() != 0) {
656 if (primerNameVector[primerIndex] != "") {
657 if(thisGroup != "") {
658 thisGroup += "." + primerNameVector[primerIndex];
660 thisGroup = primerNameVector[primerIndex];
665 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
667 if (nameFile != "") {
668 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
669 if (itName != nameMap.end()) {
670 vector<string> thisSeqsNames;
671 m->splitAtChar(itName->second, thisSeqsNames, ',');
672 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
673 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
675 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
678 map<string, int>::iterator it = groupCounts.find(thisGroup);
679 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
680 else { groupCounts[it->first]++; }
687 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
688 currSeq.printSequence(output);
692 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
693 currQual.printQScores(output);
698 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
699 if (itName != nameMap.end()) {
700 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
701 output << itName->first << '\t' << itName->second << endl;
703 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
708 if(nameFile != ""){ //needs to be before the currSeq name is changed
709 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
710 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
711 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
713 currSeq.setName(currSeq.getName() + '|' + trashCode);
714 currSeq.setUnaligned(origSeq);
715 currSeq.setAligned(origSeq);
716 currSeq.printSequence(scrapFASTAFile);
718 currQual.printQScores(scrapQualFile);
724 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
725 unsigned long int pos = inFASTA.tellg();
726 if ((pos == -1) || (pos >= line->end)) { break; }
729 if (inFASTA.eof()) { break; }
733 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
737 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
741 trimFASTAFile.close();
742 scrapFASTAFile.close();
743 if (createGroup) { outGroupsFile.close(); }
744 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
745 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
749 catch(exception& e) {
750 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
755 /**************************************************************************************************/
757 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
759 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
764 //loop through and create all the processes you want
765 while (process != processors) {
769 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
773 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
774 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
775 vector<vector<string> > tempNameFileNames = nameFileNames;
780 for(int i=0;i<tempFASTAFileNames.size();i++){
781 for(int j=0;j<tempFASTAFileNames[i].size();j++){
782 if (tempFASTAFileNames[i][j] != "") {
783 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
784 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
787 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
788 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
791 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
792 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
799 driverCreateTrim(filename,
801 (trimFASTAFileName + toString(getpid()) + ".temp"),
802 (scrapFASTAFileName + toString(getpid()) + ".temp"),
803 (trimQualFileName + toString(getpid()) + ".temp"),
804 (scrapQualFileName + toString(getpid()) + ".temp"),
805 (trimNameFileName + toString(getpid()) + ".temp"),
806 (scrapNameFileName + toString(getpid()) + ".temp"),
807 (groupFile + toString(getpid()) + ".temp"),
809 tempPrimerQualFileNames,
814 //pass groupCounts to parent
817 string tempFile = filename + toString(getpid()) + ".num.temp";
818 m->openOutputFile(tempFile, out);
820 out << groupCounts.size() << endl;
822 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
823 out << it->first << '\t' << it->second << endl;
829 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
830 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
837 m->openOutputFile(trimFASTAFileName, temp); temp.close();
838 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
840 m->openOutputFile(trimQualFileName, temp); temp.close();
841 m->openOutputFile(scrapQualFileName, temp); temp.close();
843 if (nameFile != "") {
844 m->openOutputFile(trimNameFileName, temp); temp.close();
845 m->openOutputFile(scrapNameFileName, temp); temp.close();
848 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
850 //force parent to wait until all the processes are done
851 for (int i=0;i<processIDS.size();i++) {
852 int temp = processIDS[i];
857 for(int i=0;i<processIDS.size();i++){
859 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
861 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
862 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
863 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
864 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
867 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
868 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
869 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
870 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
874 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
875 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
876 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
877 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
881 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
882 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
887 for(int j=0;j<fastaFileNames.size();j++){
888 for(int k=0;k<fastaFileNames[j].size();k++){
889 if (fastaFileNames[j][k] != "") {
890 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
891 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
894 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
895 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
899 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
900 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
909 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
910 m->openInputFile(tempFile, in);
914 in >> tempNum; m->gobble(in);
918 in >> group >> tempNum; m->gobble(in);
920 map<string, int>::iterator it = groupCounts.find(group);
921 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
922 else { groupCounts[it->first] += tempNum; }
925 in.close(); m->mothurRemove(tempFile);
933 catch(exception& e) {
934 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
939 /**************************************************************************************************/
941 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
943 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
944 //set file positions for fasta file
945 fastaFilePos = m->divideFile(filename, processors);
947 if (qfilename == "") { return processors; }
949 //get name of first sequence in each chunk
950 map<string, int> firstSeqNames;
951 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
953 m->openInputFile(filename, in);
954 in.seekg(fastaFilePos[i]);
957 firstSeqNames[temp.getName()] = i;
962 //seach for filePos of each first name in the qfile and save in qfileFilePos
964 m->openInputFile(qfilename, inQual);
967 while(!inQual.eof()){
968 input = m->getline(inQual);
970 if (input.length() != 0) {
971 if(input[0] == '>'){ //this is a sequence name line
972 istringstream nameStream(input);
974 string sname = ""; nameStream >> sname;
975 sname = sname.substr(1);
977 map<string, int>::iterator it = firstSeqNames.find(sname);
979 if(it != firstSeqNames.end()) { //this is the start of a new chunk
980 unsigned long int pos = inQual.tellg();
981 qfileFilePos.push_back(pos - input.length() - 1);
982 firstSeqNames.erase(it);
987 if (firstSeqNames.size() == 0) { break; }
992 if (firstSeqNames.size() != 0) {
993 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
994 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
1000 //get last file position of qfile
1002 unsigned long int size;
1004 //get num bytes in file
1005 pFile = fopen (qfilename.c_str(),"rb");
1006 if (pFile==NULL) perror ("Error opening file");
1008 fseek (pFile, 0, SEEK_END);
1013 qfileFilePos.push_back(size);
1019 fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1020 //get last file position of fastafile
1022 unsigned long int size;
1024 //get num bytes in file
1025 pFile = fopen (filename.c_str(),"rb");
1026 if (pFile==NULL) perror ("Error opening file");
1028 fseek (pFile, 0, SEEK_END);
1032 fastaFilePos.push_back(size);
1034 //get last file position of fastafile
1037 //get num bytes in file
1038 qFile = fopen (qfilename.c_str(),"rb");
1039 if (qFile==NULL) perror ("Error opening file");
1041 fseek (qFile, 0, SEEK_END);
1045 qfileFilePos.push_back(size);
1051 catch(exception& e) {
1052 m->errorOut(e, "TrimSeqsCommand", "setLines");
1057 //***************************************************************************************************************
1059 bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1062 m->openInputFile(oligoFile, inOligos);
1066 string type, oligo, group;
1068 int indexPrimer = 0;
1069 int indexBarcode = 0;
1071 while(!inOligos.eof()){
1076 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1077 m->gobble(inOligos);
1080 m->gobble(inOligos);
1081 //make type case insensitive
1082 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1086 for(int i=0;i<oligo.length();i++){
1087 oligo[i] = toupper(oligo[i]);
1088 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1091 if(type == "FORWARD"){
1094 // get rest of line in case there is a primer name
1095 while (!inOligos.eof()) {
1096 char c = inOligos.get();
1097 if (c == 10 || c == 13){ break; }
1098 else if (c == 32 || c == 9){;} //space or tab
1099 else { group += c; }
1102 //check for repeat barcodes
1103 map<string, int>::iterator itPrime = primers.find(oligo);
1104 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1106 primers[oligo]=indexPrimer; indexPrimer++;
1107 primerNameVector.push_back(group);
1109 else if(type == "REVERSE"){
1110 Sequence oligoRC("reverse", oligo);
1111 oligoRC.reverseComplement();
1112 revPrimer.push_back(oligoRC.getUnaligned());
1114 else if(type == "BARCODE"){
1117 //check for repeat barcodes
1118 map<string, int>::iterator itBar = barcodes.find(oligo);
1119 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1121 barcodes[oligo]=indexBarcode; indexBarcode++;
1122 barcodeNameVector.push_back(group);
1124 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1126 m->gobble(inOligos);
1130 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1132 //add in potential combos
1133 if(barcodeNameVector.size() == 0){
1135 barcodeNameVector.push_back("");
1138 if(primerNameVector.size() == 0){
1140 primerNameVector.push_back("");
1143 fastaFileNames.resize(barcodeNameVector.size());
1144 for(int i=0;i<fastaFileNames.size();i++){
1145 fastaFileNames[i].assign(primerNameVector.size(), "");
1147 if(qFileName != "") { qualFileNames = fastaFileNames; }
1148 if(nameFile != "") { nameFileNames = fastaFileNames; }
1151 set<string> uniqueNames; //used to cleanup outputFileNames
1152 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1153 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1155 string primerName = primerNameVector[itPrimer->second];
1156 string barcodeName = barcodeNameVector[itBar->second];
1158 string comboGroupName = "";
1159 string fastaFileName = "";
1160 string qualFileName = "";
1161 string nameFileName = "";
1163 if(primerName == ""){
1164 comboGroupName = barcodeNameVector[itBar->second];
1167 if(barcodeName == ""){
1168 comboGroupName = primerNameVector[itPrimer->second];
1171 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1177 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1178 if (uniqueNames.count(fastaFileName) == 0) {
1179 outputNames.push_back(fastaFileName);
1180 outputTypes["fasta"].push_back(fastaFileName);
1181 uniqueNames.insert(fastaFileName);
1184 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1185 m->openOutputFile(fastaFileName, temp); temp.close();
1187 if(qFileName != ""){
1188 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1189 if (uniqueNames.count(qualFileName) == 0) {
1190 outputNames.push_back(qualFileName);
1191 outputTypes["qfile"].push_back(qualFileName);
1194 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1195 m->openOutputFile(qualFileName, temp); temp.close();
1199 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1200 if (uniqueNames.count(nameFileName) == 0) {
1201 outputNames.push_back(nameFileName);
1202 outputTypes["name"].push_back(nameFileName);
1205 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1206 m->openOutputFile(nameFileName, temp); temp.close();
1212 numFPrimers = primers.size();
1213 numRPrimers = revPrimer.size();
1215 bool allBlank = true;
1216 for (int i = 0; i < barcodeNameVector.size(); i++) {
1217 if (barcodeNameVector[i] != "") {
1222 for (int i = 0; i < primerNameVector.size(); i++) {
1223 if (primerNameVector[i] != "") {
1230 m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine();
1238 catch(exception& e) {
1239 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1244 //***************************************************************************************************************
1246 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1249 string rawSequence = seq.getUnaligned();
1250 int success = bdiffs + 1; //guilty until proven innocent
1252 //can you find the barcode
1253 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1254 string oligo = it->first;
1255 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
1256 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
1260 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1262 seq.setUnaligned(rawSequence.substr(oligo.length()));
1264 if(qual.getName() != ""){
1265 qual.trimQScores(oligo.length(), -1);
1273 //if you found the barcode or if you don't want to allow for diffs
1274 if ((bdiffs == 0) || (success == 0)) { return success; }
1276 else { //try aligning and see if you can find it
1280 Alignment* alignment;
1281 if (barcodes.size() > 0) {
1282 map<string,int>::iterator it=barcodes.begin();
1284 for(it;it!=barcodes.end();it++){
1285 if(it->first.length() > maxLength){
1286 maxLength = it->first.length();
1289 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
1291 }else{ alignment = NULL; }
1293 //can you find the barcode
1299 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1300 string oligo = it->first;
1301 // int length = oligo.length();
1303 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
1304 success = bdiffs + 10;
1308 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1309 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1310 oligo = alignment->getSeqAAln();
1311 string temp = alignment->getSeqBAln();
1313 int alnLength = oligo.length();
1315 for(int i=oligo.length()-1;i>=0;i--){
1316 if(oligo[i] != '-'){ alnLength = i+1; break; }
1318 oligo = oligo.substr(0,alnLength);
1319 temp = temp.substr(0,alnLength);
1321 int numDiff = countDiffs(oligo, temp);
1323 if(numDiff < minDiff){
1326 minGroup = it->second;
1328 for(int i=0;i<alnLength;i++){
1334 else if(numDiff == minDiff){
1340 if(minDiff > bdiffs) { success = minDiff; } //no good matches
1341 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
1342 else{ //use the best match
1344 seq.setUnaligned(rawSequence.substr(minPos));
1346 if(qual.getName() != ""){
1347 qual.trimQScores(minPos, -1);
1352 if (alignment != NULL) { delete alignment; }
1359 catch(exception& e) {
1360 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1366 //***************************************************************************************************************
1368 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1370 string rawSequence = seq.getUnaligned();
1371 int success = pdiffs + 1; //guilty until proven innocent
1373 //can you find the primer
1374 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1375 string oligo = it->first;
1376 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
1377 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1381 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1383 seq.setUnaligned(rawSequence.substr(oligo.length()));
1384 if(qual.getName() != ""){
1385 qual.trimQScores(oligo.length(), -1);
1392 //if you found the barcode or if you don't want to allow for diffs
1393 if ((pdiffs == 0) || (success == 0)) { return success; }
1395 else { //try aligning and see if you can find it
1399 Alignment* alignment;
1400 if (primers.size() > 0) {
1401 map<string,int>::iterator it=primers.begin();
1403 for(it;it!=primers.end();it++){
1404 if(it->first.length() > maxLength){
1405 maxLength = it->first.length();
1408 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
1410 }else{ alignment = NULL; }
1412 //can you find the barcode
1418 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1419 string oligo = it->first;
1420 // int length = oligo.length();
1422 if(rawSequence.length() < maxLength){
1423 success = pdiffs + 100;
1427 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1428 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1429 oligo = alignment->getSeqAAln();
1430 string temp = alignment->getSeqBAln();
1432 int alnLength = oligo.length();
1434 for(int i=oligo.length()-1;i>=0;i--){
1435 if(oligo[i] != '-'){ alnLength = i+1; break; }
1437 oligo = oligo.substr(0,alnLength);
1438 temp = temp.substr(0,alnLength);
1440 int numDiff = countDiffs(oligo, temp);
1442 if(numDiff < minDiff){
1445 minGroup = it->second;
1447 for(int i=0;i<alnLength;i++){
1453 else if(numDiff == minDiff){
1459 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1460 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
1461 else{ //use the best match
1463 seq.setUnaligned(rawSequence.substr(minPos));
1464 if(qual.getName() != ""){
1465 qual.trimQScores(minPos, -1);
1470 if (alignment != NULL) { delete alignment; }
1477 catch(exception& e) {
1478 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1483 //***************************************************************************************************************
1485 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1487 string rawSequence = seq.getUnaligned();
1488 bool success = 0; //guilty until proven innocent
1490 for(int i=0;i<numRPrimers;i++){
1491 string oligo = revPrimer[i];
1493 if(rawSequence.length() < oligo.length()){
1498 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1499 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1500 if(qual.getName() != ""){
1501 qual.trimQScores(-1, rawSequence.length()-oligo.length());
1510 catch(exception& e) {
1511 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1516 //***************************************************************************************************************
1518 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1521 if(qscores.getName() != ""){
1522 qscores.trimQScores(-1, keepFirst);
1524 sequence.trim(keepFirst);
1527 catch(exception& e) {
1528 m->errorOut(e, "keepFirstTrim", "countDiffs");
1534 //***************************************************************************************************************
1536 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1540 int length = sequence.getNumBases() - removeLast;
1543 if(qscores.getName() != ""){
1544 qscores.trimQScores(-1, length);
1546 sequence.trim(length);
1555 catch(exception& e) {
1556 m->errorOut(e, "removeLastTrim", "countDiffs");
1562 //***************************************************************************************************************
1564 bool TrimSeqsCommand::cullLength(Sequence& seq){
1567 int length = seq.getNumBases();
1568 bool success = 0; //guilty until proven innocent
1570 if(length >= minLength && maxLength == 0) { success = 1; }
1571 else if(length >= minLength && length <= maxLength) { success = 1; }
1572 else { success = 0; }
1577 catch(exception& e) {
1578 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1584 //***************************************************************************************************************
1586 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1588 int longHomoP = seq.getLongHomoPolymer();
1589 bool success = 0; //guilty until proven innocent
1591 if(longHomoP <= maxHomoP){ success = 1; }
1592 else { success = 0; }
1596 catch(exception& e) {
1597 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1603 //***************************************************************************************************************
1605 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1607 int numNs = seq.getAmbigBases();
1608 bool success = 0; //guilty until proven innocent
1610 if(numNs <= maxAmbig) { success = 1; }
1611 else { success = 0; }
1615 catch(exception& e) {
1616 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1622 //***************************************************************************************************************
1624 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1627 int length = oligo.length();
1629 for(int i=0;i<length;i++){
1631 if(oligo[i] != seq[i]){
1632 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
1633 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
1634 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
1635 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
1636 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
1637 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1638 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
1639 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1640 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1641 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1642 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
1643 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1645 if(success == 0) { break; }
1654 catch(exception& e) {
1655 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1661 //***************************************************************************************************************
1663 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1666 int length = oligo.length();
1669 for(int i=0;i<length;i++){
1671 if(oligo[i] != seq[i]){
1672 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
1673 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
1674 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
1675 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
1676 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
1677 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1678 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
1679 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1680 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1681 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1682 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
1683 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1690 catch(exception& e) {
1691 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1697 //***************************************************************************************************************