5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
13 //**********************************************************************************************************************
14 vector<string> TrimSeqsCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
19 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
20 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
21 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
22 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
23 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
24 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
25 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
26 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
27 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
28 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
29 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
30 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
31 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
32 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
33 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
34 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
35 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
36 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
37 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
38 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
39 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
40 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
42 vector<string> myArray;
43 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
47 m->errorOut(e, "TrimSeqsCommand", "setParameters");
51 //**********************************************************************************************************************
52 string TrimSeqsCommand::getHelpString(){
54 string helpString = "";
55 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
56 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
57 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
58 helpString += "The fasta parameter is required.\n";
59 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
60 helpString += "The oligos parameter allows you to provide an oligos file.\n";
61 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
62 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
63 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
64 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
65 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
66 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
67 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
68 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
69 helpString += "The qfile parameter allows you to provide a quality file.\n";
70 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
71 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
72 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
73 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
74 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
75 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
76 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
77 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
78 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
79 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
80 helpString += "The trim.seqs command should be in the following format: \n";
81 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
82 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
83 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
84 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
85 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
89 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
95 //**********************************************************************************************************************
97 TrimSeqsCommand::TrimSeqsCommand(){
99 abort = true; calledHelp = true;
101 vector<string> tempOutNames;
102 outputTypes["fasta"] = tempOutNames;
103 outputTypes["qfile"] = tempOutNames;
104 outputTypes["group"] = tempOutNames;
105 outputTypes["name"] = tempOutNames;
107 catch(exception& e) {
108 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
112 //***************************************************************************************************************
114 TrimSeqsCommand::TrimSeqsCommand(string option) {
117 abort = false; calledHelp = false;
120 //allow user to run help
121 if(option == "help") { help(); abort = true; calledHelp = true; }
122 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
125 vector<string> myArray = setParameters();
127 OptionParser parser(option);
128 map<string,string> parameters = parser.getParameters();
130 ValidParameters validParameter;
131 map<string,string>::iterator it;
133 //check to make sure all parameters are valid for command
134 for (it = parameters.begin(); it != parameters.end(); it++) {
135 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
138 //initialize outputTypes
139 vector<string> tempOutNames;
140 outputTypes["fasta"] = tempOutNames;
141 outputTypes["qfile"] = tempOutNames;
142 outputTypes["group"] = tempOutNames;
143 outputTypes["name"] = tempOutNames;
145 //if the user changes the input directory command factory will send this info to us in the output parameter
146 string inputDir = validParameter.validFile(parameters, "inputdir", false);
147 if (inputDir == "not found"){ inputDir = ""; }
150 it = parameters.find("fasta");
151 //user has given a template file
152 if(it != parameters.end()){
153 path = m->hasPath(it->second);
154 //if the user has not given a path then, add inputdir. else leave path alone.
155 if (path == "") { parameters["fasta"] = inputDir + it->second; }
158 it = parameters.find("oligos");
159 //user has given a template file
160 if(it != parameters.end()){
161 path = m->hasPath(it->second);
162 //if the user has not given a path then, add inputdir. else leave path alone.
163 if (path == "") { parameters["oligos"] = inputDir + it->second; }
166 it = parameters.find("qfile");
167 //user has given a template file
168 if(it != parameters.end()){
169 path = m->hasPath(it->second);
170 //if the user has not given a path then, add inputdir. else leave path alone.
171 if (path == "") { parameters["qfile"] = inputDir + it->second; }
174 it = parameters.find("name");
175 //user has given a template file
176 if(it != parameters.end()){
177 path = m->hasPath(it->second);
178 //if the user has not given a path then, add inputdir. else leave path alone.
179 if (path == "") { parameters["name"] = inputDir + it->second; }
185 //check for required parameters
186 fastaFile = validParameter.validFile(parameters, "fasta", true);
187 if (fastaFile == "not found") {
188 fastaFile = m->getFastaFile();
189 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
190 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
191 }else if (fastaFile == "not open") { abort = true; }
193 //if the user changes the output directory command factory will send this info to us in the output parameter
194 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
196 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
200 //check for optional parameter and set defaults
201 // ...at some point should added some additional type checking...
203 temp = validParameter.validFile(parameters, "flip", false);
204 if (temp == "not found"){ flip = 0; }
205 else if(m->isTrue(temp)) { flip = 1; }
207 temp = validParameter.validFile(parameters, "oligos", true);
208 if (temp == "not found"){ oligoFile = ""; }
209 else if(temp == "not open"){ abort = true; }
210 else { oligoFile = temp; }
213 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
214 convert(temp, maxAmbig);
216 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
217 convert(temp, maxHomoP);
219 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
220 convert(temp, minLength);
222 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
223 convert(temp, maxLength);
225 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
226 convert(temp, bdiffs);
228 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
229 convert(temp, pdiffs);
231 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
232 convert(temp, tdiffs);
234 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
236 temp = validParameter.validFile(parameters, "qfile", true);
237 if (temp == "not found") { qFileName = ""; }
238 else if(temp == "not open") { abort = true; }
239 else { qFileName = temp; }
241 temp = validParameter.validFile(parameters, "name", true);
242 if (temp == "not found") { nameFile = ""; }
243 else if(temp == "not open") { nameFile = ""; abort = true; }
244 else { nameFile = temp; }
246 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
247 convert(temp, qThreshold);
249 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
250 qtrim = m->isTrue(temp);
252 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
253 convert(temp, qRollAverage);
255 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
256 convert(temp, qWindowAverage);
258 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
259 convert(temp, qWindowSize);
261 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
262 convert(temp, qWindowStep);
264 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
265 convert(temp, qAverage);
267 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
268 convert(temp, keepFirst);
270 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
271 convert(temp, removeLast);
273 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
274 allFiles = m->isTrue(temp);
276 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
277 m->setProcessors(temp);
278 convert(temp, processors);
281 if(allFiles && (oligoFile == "")){
282 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
284 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
285 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
289 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
290 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
296 catch(exception& e) {
297 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
301 //***************************************************************************************************************
303 int TrimSeqsCommand::execute(){
306 if (abort == true) { if (calledHelp) { return 0; } return 2; }
308 numFPrimers = 0; //this needs to be initialized
310 vector<vector<string> > fastaFileNames;
311 vector<vector<string> > qualFileNames;
312 vector<vector<string> > nameFileNames;
314 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
315 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
317 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
318 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
320 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
321 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
323 if (qFileName != "") {
324 outputNames.push_back(trimQualFile);
325 outputNames.push_back(scrapQualFile);
326 outputTypes["qfile"].push_back(trimQualFile);
327 outputTypes["qfile"].push_back(scrapQualFile);
330 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
331 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
333 if (nameFile != "") {
334 m->readNames(nameFile, nameMap);
335 outputNames.push_back(trimNameFile);
336 outputNames.push_back(scrapNameFile);
337 outputTypes["name"].push_back(trimNameFile);
338 outputTypes["name"].push_back(scrapNameFile);
341 if (m->control_pressed) { return 0; }
343 string outputGroupFileName;
345 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
346 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
347 getOligos(fastaFileNames, qualFileNames, nameFileNames);
350 vector<unsigned long int> fastaFilePos;
351 vector<unsigned long int> qFilePos;
353 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
355 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
356 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
357 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
359 if(qFileName == "") { qLines = lines; } //files with duds
361 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
363 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
365 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
368 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
371 if (m->control_pressed) { return 0; }
374 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
375 map<string, string>::iterator it;
376 set<string> namesToRemove;
377 for(int i=0;i<fastaFileNames.size();i++){
378 for(int j=0;j<fastaFileNames[0].size();j++){
379 if (fastaFileNames[i][j] != "") {
380 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
381 if(m->isBlank(fastaFileNames[i][j])){
382 remove(fastaFileNames[i][j].c_str());
383 namesToRemove.insert(fastaFileNames[i][j]);
386 remove(qualFileNames[i][j].c_str());
387 namesToRemove.insert(qualFileNames[i][j]);
391 remove(nameFileNames[i][j].c_str());
392 namesToRemove.insert(nameFileNames[i][j]);
395 it = uniqueFastaNames.find(fastaFileNames[i][j]);
396 if (it == uniqueFastaNames.end()) {
397 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
405 //remove names for outputFileNames, just cleans up the output
406 vector<string> outputNames2;
407 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
408 outputNames = outputNames2;
410 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
412 m->openInputFile(it->first, in);
415 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
416 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
417 m->openOutputFile(thisGroupName, out);
420 if (m->control_pressed) { break; }
422 Sequence currSeq(in); m->gobble(in);
423 out << currSeq.getName() << '\t' << it->second << endl;
430 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
432 //output group counts
433 m->mothurOutEndLine();
435 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
436 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
437 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
439 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
441 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
443 //set fasta file as new current fastafile
445 itTypes = outputTypes.find("fasta");
446 if (itTypes != outputTypes.end()) {
447 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
450 itTypes = outputTypes.find("name");
451 if (itTypes != outputTypes.end()) {
452 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
455 itTypes = outputTypes.find("qfile");
456 if (itTypes != outputTypes.end()) {
457 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
460 itTypes = outputTypes.find("group");
461 if (itTypes != outputTypes.end()) {
462 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
465 m->mothurOutEndLine();
466 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
467 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
468 m->mothurOutEndLine();
473 catch(exception& e) {
474 m->errorOut(e, "TrimSeqsCommand", "execute");
479 /**************************************************************************************/
481 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair* line, linePair* qline) {
485 ofstream trimFASTAFile;
486 m->openOutputFile(trimFileName, trimFASTAFile);
488 ofstream scrapFASTAFile;
489 m->openOutputFile(scrapFileName, scrapFASTAFile);
491 ofstream trimQualFile;
492 ofstream scrapQualFile;
494 m->openOutputFile(trimQFileName, trimQualFile);
495 m->openOutputFile(scrapQFileName, scrapQualFile);
498 ofstream trimNameFile;
499 ofstream scrapNameFile;
501 m->openOutputFile(trimNFileName, trimNameFile);
502 m->openOutputFile(scrapNFileName, scrapNameFile);
506 ofstream outGroupsFile;
507 if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); }
509 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
510 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
511 if (fastaFileNames[i][j] != "") {
513 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
515 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
519 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
527 m->openInputFile(filename, inFASTA);
528 inFASTA.seekg(line->start);
531 if(qFileName != "") {
532 m->openInputFile(qFileName, qFile);
533 qFile.seekg(qline->start);
541 if (m->control_pressed) {
542 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
543 if (oligoFile != "") { outGroupsFile.close(); }
548 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
554 string trashCode = "";
555 int currentSeqsDiffs = 0;
557 Sequence currSeq(inFASTA); m->gobble(inFASTA);
558 QualityScores currQual;
560 currQual = QualityScores(qFile); m->gobble(qFile);
562 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
563 string origSeq = currSeq.getUnaligned();
566 int barcodeIndex = 0;
569 if(barcodes.size() != 0){
570 success = stripBarcode(currSeq, currQual, barcodeIndex);
571 if(success > bdiffs) { trashCode += 'b'; }
572 else{ currentSeqsDiffs += success; }
575 if(numFPrimers != 0){
576 success = stripForward(currSeq, currQual, primerIndex);
577 if(success > pdiffs) { trashCode += 'f'; }
578 else{ currentSeqsDiffs += success; }
581 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
583 if(numRPrimers != 0){
584 success = stripReverse(currSeq, currQual);
585 if(!success) { trashCode += 'r'; }
589 success = keepFirstTrim(currSeq, currQual);
593 success = removeLastTrim(currSeq, currQual);
594 if(!success) { trashCode += 'l'; }
599 int origLength = currSeq.getNumBases();
601 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
602 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
603 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
604 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
605 else { success = 1; }
607 //you don't want to trim, if it fails above then scrap it
608 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
610 if(!success) { trashCode += 'q'; }
613 if(minLength > 0 || maxLength > 0){
614 success = cullLength(currSeq);
615 if(!success) { trashCode += 'l'; }
618 success = cullHomoP(currSeq);
619 if(!success) { trashCode += 'h'; }
622 success = cullAmbigs(currSeq);
623 if(!success) { trashCode += 'n'; }
626 if(flip){ // should go last
627 currSeq.reverseComplement();
629 currQual.flipQScores();
633 if(trashCode.length() == 0){
634 currSeq.setAligned(currSeq.getUnaligned());
635 currSeq.printSequence(trimFASTAFile);
638 currQual.printQScores(trimQualFile);
642 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
643 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
644 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
647 if(barcodes.size() != 0){
648 string thisGroup = barcodeNameVector[barcodeIndex];
649 if (primers.size() != 0) { if (primerNameVector[primerIndex] != "") { thisGroup += "." + primerNameVector[primerIndex]; } }
651 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
653 if (nameFile != "") {
654 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
655 if (itName != nameMap.end()) {
656 vector<string> thisSeqsNames;
657 m->splitAtChar(itName->second, thisSeqsNames, ',');
658 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
659 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
661 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
664 map<string, int>::iterator it = groupCounts.find(thisGroup);
665 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
666 else { groupCounts[it->first]++; }
673 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
674 currSeq.printSequence(output);
678 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
679 currQual.printQScores(output);
684 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
685 if (itName != nameMap.end()) {
686 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
687 output << itName->first << '\t' << itName->second << endl;
689 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
694 if(nameFile != ""){ //needs to be before the currSeq name is changed
695 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
696 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
697 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
699 currSeq.setName(currSeq.getName() + '|' + trashCode);
700 currSeq.setUnaligned(origSeq);
701 currSeq.setAligned(origSeq);
702 currSeq.printSequence(scrapFASTAFile);
704 currQual.printQScores(scrapQualFile);
710 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
711 unsigned long int pos = inFASTA.tellg();
712 if ((pos == -1) || (pos >= line->end)) { break; }
715 if (inFASTA.eof()) { break; }
719 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
723 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
727 trimFASTAFile.close();
728 scrapFASTAFile.close();
729 if (oligoFile != "") { outGroupsFile.close(); }
730 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
731 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
735 catch(exception& e) {
736 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
741 /**************************************************************************************************/
743 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
745 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
750 //loop through and create all the processes you want
751 while (process != processors) {
755 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
759 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
760 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
761 vector<vector<string> > tempNameFileNames = nameFileNames;
766 for(int i=0;i<tempFASTAFileNames.size();i++){
767 for(int j=0;j<tempFASTAFileNames[i].size();j++){
768 if (tempFASTAFileNames[i][j] != "") {
769 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
770 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
773 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
774 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
777 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
778 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
785 driverCreateTrim(filename,
787 (trimFASTAFileName + toString(getpid()) + ".temp"),
788 (scrapFASTAFileName + toString(getpid()) + ".temp"),
789 (trimQualFileName + toString(getpid()) + ".temp"),
790 (scrapQualFileName + toString(getpid()) + ".temp"),
791 (trimNameFileName + toString(getpid()) + ".temp"),
792 (scrapNameFileName + toString(getpid()) + ".temp"),
793 (groupFile + toString(getpid()) + ".temp"),
795 tempPrimerQualFileNames,
800 //pass groupCounts to parent
803 string tempFile = filename + toString(getpid()) + ".num.temp";
804 m->openOutputFile(tempFile, out);
806 out << groupCounts.size() << endl;
808 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
809 out << it->first << '\t' << it->second << endl;
815 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
816 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
823 m->openOutputFile(trimFASTAFileName, temp); temp.close();
824 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
826 m->openOutputFile(trimQualFileName, temp); temp.close();
827 m->openOutputFile(scrapQualFileName, temp); temp.close();
829 if (nameFile != "") {
830 m->openOutputFile(trimNameFileName, temp); temp.close();
831 m->openOutputFile(scrapNameFileName, temp); temp.close();
834 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
836 //force parent to wait until all the processes are done
837 for (int i=0;i<processIDS.size();i++) {
838 int temp = processIDS[i];
843 for(int i=0;i<processIDS.size();i++){
845 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
847 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
848 remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
849 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
850 remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
853 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
854 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
855 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
856 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
860 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
861 remove((trimNameFileName + toString(processIDS[i]) + ".temp").c_str());
862 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
863 remove((scrapNameFileName + toString(processIDS[i]) + ".temp").c_str());
867 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
868 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
873 for(int j=0;j<fastaFileNames.size();j++){
874 for(int k=0;k<fastaFileNames[j].size();k++){
875 if (fastaFileNames[j][k] != "") {
876 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
877 remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
880 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
881 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
885 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
886 remove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
895 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
896 m->openInputFile(tempFile, in);
900 in >> tempNum; m->gobble(in);
904 in >> group >> tempNum; m->gobble(in);
906 map<string, int>::iterator it = groupCounts.find(group);
907 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
908 else { groupCounts[it->first] += tempNum; }
911 in.close(); remove(tempFile.c_str());
919 catch(exception& e) {
920 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
925 /**************************************************************************************************/
927 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
929 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
930 //set file positions for fasta file
931 fastaFilePos = m->divideFile(filename, processors);
933 if (qfilename == "") { return processors; }
935 //get name of first sequence in each chunk
936 map<string, int> firstSeqNames;
937 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
939 m->openInputFile(filename, in);
940 in.seekg(fastaFilePos[i]);
943 firstSeqNames[temp.getName()] = i;
948 //seach for filePos of each first name in the qfile and save in qfileFilePos
950 m->openInputFile(qfilename, inQual);
953 while(!inQual.eof()){
954 input = m->getline(inQual);
956 if (input.length() != 0) {
957 if(input[0] == '>'){ //this is a sequence name line
958 istringstream nameStream(input);
960 string sname = ""; nameStream >> sname;
961 sname = sname.substr(1);
963 map<string, int>::iterator it = firstSeqNames.find(sname);
965 if(it != firstSeqNames.end()) { //this is the start of a new chunk
966 unsigned long int pos = inQual.tellg();
967 qfileFilePos.push_back(pos - input.length() - 1);
968 firstSeqNames.erase(it);
973 if (firstSeqNames.size() == 0) { break; }
978 if (firstSeqNames.size() != 0) {
979 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
980 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
986 //get last file position of qfile
988 unsigned long int size;
990 //get num bytes in file
991 pFile = fopen (qfilename.c_str(),"rb");
992 if (pFile==NULL) perror ("Error opening file");
994 fseek (pFile, 0, SEEK_END);
999 qfileFilePos.push_back(size);
1005 fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1006 //get last file position of fastafile
1008 unsigned long int size;
1010 //get num bytes in file
1011 pFile = fopen (filename.c_str(),"rb");
1012 if (pFile==NULL) perror ("Error opening file");
1014 fseek (pFile, 0, SEEK_END);
1018 fastaFilePos.push_back(size);
1020 //get last file position of fastafile
1023 //get num bytes in file
1024 qFile = fopen (qfilename.c_str(),"rb");
1025 if (qFile==NULL) perror ("Error opening file");
1027 fseek (qFile, 0, SEEK_END);
1031 qfileFilePos.push_back(size);
1037 catch(exception& e) {
1038 m->errorOut(e, "TrimSeqsCommand", "setLines");
1043 //***************************************************************************************************************
1045 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1048 m->openInputFile(oligoFile, inOligos);
1052 string type, oligo, group;
1054 int indexPrimer = 0;
1055 int indexBarcode = 0;
1057 while(!inOligos.eof()){
1062 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1063 m->gobble(inOligos);
1066 m->gobble(inOligos);
1067 //make type case insensitive
1068 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1072 for(int i=0;i<oligo.length();i++){
1073 oligo[i] = toupper(oligo[i]);
1074 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1077 if(type == "FORWARD"){
1080 // get rest of line in case there is a primer name
1081 while (!inOligos.eof()) {
1082 char c = inOligos.get();
1083 if (c == 10 || c == 13){ break; }
1084 else if (c == 32 || c == 9){;} //space or tab
1085 else { group += c; }
1088 //check for repeat barcodes
1089 map<string, int>::iterator itPrime = primers.find(oligo);
1090 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1092 primers[oligo]=indexPrimer; indexPrimer++;
1093 primerNameVector.push_back(group);
1095 else if(type == "REVERSE"){
1096 Sequence oligoRC("reverse", oligo);
1097 oligoRC.reverseComplement();
1098 revPrimer.push_back(oligoRC.getUnaligned());
1100 else if(type == "BARCODE"){
1103 //check for repeat barcodes
1104 map<string, int>::iterator itBar = barcodes.find(oligo);
1105 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1107 barcodes[oligo]=indexBarcode; indexBarcode++;
1108 barcodeNameVector.push_back(group);
1110 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1112 m->gobble(inOligos);
1116 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1118 //add in potential combos
1119 if(barcodeNameVector.size() == 0){
1121 barcodeNameVector.push_back("");
1124 if(primerNameVector.size() == 0){
1126 primerNameVector.push_back("");
1129 fastaFileNames.resize(barcodeNameVector.size());
1130 for(int i=0;i<fastaFileNames.size();i++){
1131 fastaFileNames[i].assign(primerNameVector.size(), "");
1133 if(qFileName != "") { qualFileNames = fastaFileNames; }
1134 if(nameFile != "") { nameFileNames = fastaFileNames; }
1137 set<string> uniqueNames; //used to cleanup outputFileNames
1138 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1139 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1141 string primerName = primerNameVector[itPrimer->second];
1142 string barcodeName = barcodeNameVector[itBar->second];
1144 string comboGroupName = "";
1145 string fastaFileName = "";
1146 string qualFileName = "";
1147 string nameFileName = "";
1149 if(primerName == ""){
1150 comboGroupName = barcodeNameVector[itBar->second];
1153 if(barcodeName == ""){
1154 comboGroupName = primerNameVector[itPrimer->second];
1157 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1163 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1164 if (uniqueNames.count(fastaFileName) == 0) {
1165 outputNames.push_back(fastaFileName);
1166 outputTypes["fasta"].push_back(fastaFileName);
1167 uniqueNames.insert(fastaFileName);
1170 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1171 m->openOutputFile(fastaFileName, temp); temp.close();
1173 if(qFileName != ""){
1174 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1175 if (uniqueNames.count(qualFileName) == 0) {
1176 outputNames.push_back(qualFileName);
1177 outputTypes["qfile"].push_back(qualFileName);
1180 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1181 m->openOutputFile(qualFileName, temp); temp.close();
1185 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1186 if (uniqueNames.count(nameFileName) == 0) {
1187 outputNames.push_back(nameFileName);
1188 outputTypes["name"].push_back(nameFileName);
1191 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1192 m->openOutputFile(nameFileName, temp); temp.close();
1198 numFPrimers = primers.size();
1199 numRPrimers = revPrimer.size();
1202 catch(exception& e) {
1203 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1208 //***************************************************************************************************************
1210 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1213 string rawSequence = seq.getUnaligned();
1214 int success = bdiffs + 1; //guilty until proven innocent
1216 //can you find the barcode
1217 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1218 string oligo = it->first;
1219 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
1220 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
1224 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1226 seq.setUnaligned(rawSequence.substr(oligo.length()));
1228 if(qual.getName() != ""){
1229 qual.trimQScores(oligo.length(), -1);
1237 //if you found the barcode or if you don't want to allow for diffs
1238 if ((bdiffs == 0) || (success == 0)) { return success; }
1240 else { //try aligning and see if you can find it
1244 Alignment* alignment;
1245 if (barcodes.size() > 0) {
1246 map<string,int>::iterator it=barcodes.begin();
1248 for(it;it!=barcodes.end();it++){
1249 if(it->first.length() > maxLength){
1250 maxLength = it->first.length();
1253 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
1255 }else{ alignment = NULL; }
1257 //can you find the barcode
1263 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1264 string oligo = it->first;
1265 // int length = oligo.length();
1267 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
1268 success = bdiffs + 10;
1272 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1273 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1274 oligo = alignment->getSeqAAln();
1275 string temp = alignment->getSeqBAln();
1277 int alnLength = oligo.length();
1279 for(int i=oligo.length()-1;i>=0;i--){
1280 if(oligo[i] != '-'){ alnLength = i+1; break; }
1282 oligo = oligo.substr(0,alnLength);
1283 temp = temp.substr(0,alnLength);
1285 int numDiff = countDiffs(oligo, temp);
1287 if(numDiff < minDiff){
1290 minGroup = it->second;
1292 for(int i=0;i<alnLength;i++){
1298 else if(numDiff == minDiff){
1304 if(minDiff > bdiffs) { success = minDiff; } //no good matches
1305 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
1306 else{ //use the best match
1308 seq.setUnaligned(rawSequence.substr(minPos));
1310 if(qual.getName() != ""){
1311 qual.trimQScores(minPos, -1);
1316 if (alignment != NULL) { delete alignment; }
1323 catch(exception& e) {
1324 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1330 //***************************************************************************************************************
1332 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1334 string rawSequence = seq.getUnaligned();
1335 int success = pdiffs + 1; //guilty until proven innocent
1337 //can you find the primer
1338 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1339 string oligo = it->first;
1340 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
1341 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1345 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1347 seq.setUnaligned(rawSequence.substr(oligo.length()));
1348 if(qual.getName() != ""){
1349 qual.trimQScores(oligo.length(), -1);
1356 //if you found the barcode or if you don't want to allow for diffs
1357 if ((pdiffs == 0) || (success == 0)) { return success; }
1359 else { //try aligning and see if you can find it
1363 Alignment* alignment;
1364 if (primers.size() > 0) {
1365 map<string,int>::iterator it=primers.begin();
1367 for(it;it!=primers.end();it++){
1368 if(it->first.length() > maxLength){
1369 maxLength = it->first.length();
1372 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
1374 }else{ alignment = NULL; }
1376 //can you find the barcode
1382 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1383 string oligo = it->first;
1384 // int length = oligo.length();
1386 if(rawSequence.length() < maxLength){
1387 success = pdiffs + 100;
1391 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1392 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1393 oligo = alignment->getSeqAAln();
1394 string temp = alignment->getSeqBAln();
1396 int alnLength = oligo.length();
1398 for(int i=oligo.length()-1;i>=0;i--){
1399 if(oligo[i] != '-'){ alnLength = i+1; break; }
1401 oligo = oligo.substr(0,alnLength);
1402 temp = temp.substr(0,alnLength);
1404 int numDiff = countDiffs(oligo, temp);
1406 if(numDiff < minDiff){
1409 minGroup = it->second;
1411 for(int i=0;i<alnLength;i++){
1417 else if(numDiff == minDiff){
1423 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1424 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
1425 else{ //use the best match
1427 seq.setUnaligned(rawSequence.substr(minPos));
1428 if(qual.getName() != ""){
1429 qual.trimQScores(minPos, -1);
1434 if (alignment != NULL) { delete alignment; }
1441 catch(exception& e) {
1442 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1447 //***************************************************************************************************************
1449 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1451 string rawSequence = seq.getUnaligned();
1452 bool success = 0; //guilty until proven innocent
1454 for(int i=0;i<numRPrimers;i++){
1455 string oligo = revPrimer[i];
1457 if(rawSequence.length() < oligo.length()){
1462 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1463 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1464 if(qual.getName() != ""){
1465 qual.trimQScores(-1, rawSequence.length()-oligo.length());
1474 catch(exception& e) {
1475 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1480 //***************************************************************************************************************
1482 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1485 if(qscores.getName() != ""){
1486 qscores.trimQScores(-1, keepFirst);
1488 sequence.trim(keepFirst);
1491 catch(exception& e) {
1492 m->errorOut(e, "keepFirstTrim", "countDiffs");
1498 //***************************************************************************************************************
1500 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1504 int length = sequence.getNumBases() - removeLast;
1507 if(qscores.getName() != ""){
1508 qscores.trimQScores(-1, length);
1510 sequence.trim(length);
1519 catch(exception& e) {
1520 m->errorOut(e, "removeLastTrim", "countDiffs");
1526 //***************************************************************************************************************
1528 bool TrimSeqsCommand::cullLength(Sequence& seq){
1531 int length = seq.getNumBases();
1532 bool success = 0; //guilty until proven innocent
1534 if(length >= minLength && maxLength == 0) { success = 1; }
1535 else if(length >= minLength && length <= maxLength) { success = 1; }
1536 else { success = 0; }
1541 catch(exception& e) {
1542 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1548 //***************************************************************************************************************
1550 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1552 int longHomoP = seq.getLongHomoPolymer();
1553 bool success = 0; //guilty until proven innocent
1555 if(longHomoP <= maxHomoP){ success = 1; }
1556 else { success = 0; }
1560 catch(exception& e) {
1561 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1567 //***************************************************************************************************************
1569 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1571 int numNs = seq.getAmbigBases();
1572 bool success = 0; //guilty until proven innocent
1574 if(numNs <= maxAmbig) { success = 1; }
1575 else { success = 0; }
1579 catch(exception& e) {
1580 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1586 //***************************************************************************************************************
1588 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1591 int length = oligo.length();
1593 for(int i=0;i<length;i++){
1595 if(oligo[i] != seq[i]){
1596 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
1597 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
1598 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
1599 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
1600 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
1601 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1602 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
1603 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1604 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1605 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1606 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
1607 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1609 if(success == 0) { break; }
1618 catch(exception& e) {
1619 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1625 //***************************************************************************************************************
1627 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1630 int length = oligo.length();
1633 for(int i=0;i<length;i++){
1635 if(oligo[i] != seq[i]){
1636 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
1637 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
1638 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
1639 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
1640 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
1641 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1642 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
1643 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1644 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1645 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1646 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
1647 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1654 catch(exception& e) {
1655 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1661 //***************************************************************************************************************