5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
13 //**********************************************************************************************************************
14 vector<string> TrimSeqsCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
19 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
20 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
21 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
22 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
23 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
24 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
25 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
26 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
27 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
28 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
29 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
30 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
31 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
32 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
33 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
34 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
35 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
36 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
37 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
38 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
39 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
40 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
42 vector<string> myArray;
43 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
47 m->errorOut(e, "TrimSeqsCommand", "setParameters");
51 //**********************************************************************************************************************
52 string TrimSeqsCommand::getHelpString(){
54 string helpString = "";
55 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
56 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
57 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
58 helpString += "The fasta parameter is required.\n";
59 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
60 helpString += "The oligos parameter allows you to provide an oligos file.\n";
61 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
62 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
63 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
64 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
65 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
66 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
67 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
68 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
69 helpString += "The qfile parameter allows you to provide a quality file.\n";
70 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
71 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
72 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
73 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
74 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
75 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
76 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
77 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
78 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
79 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
80 helpString += "The trim.seqs command should be in the following format: \n";
81 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
82 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
83 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
84 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
85 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
89 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
95 //**********************************************************************************************************************
97 TrimSeqsCommand::TrimSeqsCommand(){
99 abort = true; calledHelp = true;
101 vector<string> tempOutNames;
102 outputTypes["fasta"] = tempOutNames;
103 outputTypes["qfile"] = tempOutNames;
104 outputTypes["group"] = tempOutNames;
105 outputTypes["name"] = tempOutNames;
107 catch(exception& e) {
108 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
112 //***************************************************************************************************************
114 TrimSeqsCommand::TrimSeqsCommand(string option) {
117 abort = false; calledHelp = false;
120 //allow user to run help
121 if(option == "help") { help(); abort = true; calledHelp = true; }
122 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
125 vector<string> myArray = setParameters();
127 OptionParser parser(option);
128 map<string,string> parameters = parser.getParameters();
130 ValidParameters validParameter;
131 map<string,string>::iterator it;
133 //check to make sure all parameters are valid for command
134 for (it = parameters.begin(); it != parameters.end(); it++) {
135 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
138 //initialize outputTypes
139 vector<string> tempOutNames;
140 outputTypes["fasta"] = tempOutNames;
141 outputTypes["qfile"] = tempOutNames;
142 outputTypes["group"] = tempOutNames;
143 outputTypes["name"] = tempOutNames;
145 //if the user changes the input directory command factory will send this info to us in the output parameter
146 string inputDir = validParameter.validFile(parameters, "inputdir", false);
147 if (inputDir == "not found"){ inputDir = ""; }
150 it = parameters.find("fasta");
151 //user has given a template file
152 if(it != parameters.end()){
153 path = m->hasPath(it->second);
154 //if the user has not given a path then, add inputdir. else leave path alone.
155 if (path == "") { parameters["fasta"] = inputDir + it->second; }
158 it = parameters.find("oligos");
159 //user has given a template file
160 if(it != parameters.end()){
161 path = m->hasPath(it->second);
162 //if the user has not given a path then, add inputdir. else leave path alone.
163 if (path == "") { parameters["oligos"] = inputDir + it->second; }
166 it = parameters.find("qfile");
167 //user has given a template file
168 if(it != parameters.end()){
169 path = m->hasPath(it->second);
170 //if the user has not given a path then, add inputdir. else leave path alone.
171 if (path == "") { parameters["qfile"] = inputDir + it->second; }
174 it = parameters.find("name");
175 //user has given a template file
176 if(it != parameters.end()){
177 path = m->hasPath(it->second);
178 //if the user has not given a path then, add inputdir. else leave path alone.
179 if (path == "") { parameters["name"] = inputDir + it->second; }
185 //check for required parameters
186 fastaFile = validParameter.validFile(parameters, "fasta", true);
187 if (fastaFile == "not found") {
188 fastaFile = m->getFastaFile();
189 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
190 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
191 }else if (fastaFile == "not open") { abort = true; }
193 //if the user changes the output directory command factory will send this info to us in the output parameter
194 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
196 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
200 //check for optional parameter and set defaults
201 // ...at some point should added some additional type checking...
203 temp = validParameter.validFile(parameters, "flip", false);
204 if (temp == "not found"){ flip = 0; }
205 else if(m->isTrue(temp)) { flip = 1; }
207 temp = validParameter.validFile(parameters, "oligos", true);
208 if (temp == "not found"){ oligoFile = ""; }
209 else if(temp == "not open"){ abort = true; }
210 else { oligoFile = temp; }
213 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
214 convert(temp, maxAmbig);
216 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
217 convert(temp, maxHomoP);
219 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
220 convert(temp, minLength);
222 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
223 convert(temp, maxLength);
225 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
226 convert(temp, bdiffs);
228 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
229 convert(temp, pdiffs);
231 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
232 convert(temp, tdiffs);
234 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
236 temp = validParameter.validFile(parameters, "qfile", true);
237 if (temp == "not found") { qFileName = ""; }
238 else if(temp == "not open") { abort = true; }
239 else { qFileName = temp; }
241 temp = validParameter.validFile(parameters, "name", true);
242 if (temp == "not found") { nameFile = ""; }
243 else if(temp == "not open") { abort = true; }
244 else { nameFile = temp; }
246 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
247 convert(temp, qThreshold);
249 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
250 qtrim = m->isTrue(temp);
252 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
253 convert(temp, qRollAverage);
255 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
256 convert(temp, qWindowAverage);
258 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
259 convert(temp, qWindowSize);
261 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
262 convert(temp, qWindowStep);
264 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
265 convert(temp, qAverage);
267 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
268 convert(temp, keepFirst);
270 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
271 convert(temp, removeLast);
273 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
274 allFiles = m->isTrue(temp);
276 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
277 m->setProcessors(temp);
278 convert(temp, processors);
281 if(allFiles && (oligoFile == "")){
282 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
284 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
285 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
289 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
290 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
296 catch(exception& e) {
297 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
301 //***************************************************************************************************************
303 int TrimSeqsCommand::execute(){
306 if (abort == true) { if (calledHelp) { return 0; } return 2; }
308 numFPrimers = 0; //this needs to be initialized
310 vector<vector<string> > fastaFileNames;
311 vector<vector<string> > qualFileNames;
312 vector<vector<string> > nameFileNames;
314 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
315 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
317 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
318 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
320 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
321 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
323 if (qFileName != "") {
324 outputNames.push_back(trimQualFile);
325 outputNames.push_back(scrapQualFile);
326 outputTypes["qfile"].push_back(trimQualFile);
327 outputTypes["qfile"].push_back(scrapQualFile);
330 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
331 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
333 if (nameFile != "") {
334 m->readNames(nameFile, nameMap);
335 outputNames.push_back(trimNameFile);
336 outputNames.push_back(scrapNameFile);
337 outputTypes["name"].push_back(trimNameFile);
338 outputTypes["name"].push_back(scrapNameFile);
341 if (m->control_pressed) { return 0; }
343 string outputGroupFileName;
345 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
346 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
347 getOligos(fastaFileNames, qualFileNames, nameFileNames);
350 vector<unsigned long int> fastaFilePos;
351 vector<unsigned long int> qFilePos;
353 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
355 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
356 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
357 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
359 if(qFileName == "") { qLines = lines; } //files with duds
361 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
363 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
365 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
368 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
371 if (m->control_pressed) { return 0; }
374 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
375 map<string, string>::iterator it;
376 set<string> namesToRemove;
377 for(int i=0;i<fastaFileNames.size();i++){
378 for(int j=0;j<fastaFileNames[0].size();j++){
379 if (fastaFileNames[i][j] != "") {
380 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
381 if(m->isBlank(fastaFileNames[i][j])){
382 remove(fastaFileNames[i][j].c_str());
383 namesToRemove.insert(fastaFileNames[i][j]);
386 remove(qualFileNames[i][j].c_str());
387 namesToRemove.insert(qualFileNames[i][j]);
391 remove(nameFileNames[i][j].c_str());
392 namesToRemove.insert(nameFileNames[i][j]);
395 it = uniqueFastaNames.find(fastaFileNames[i][j]);
396 if (it == uniqueFastaNames.end()) {
397 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
405 //remove names for outputFileNames, just cleans up the output
406 vector<string> outputNames2;
407 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
408 outputNames = outputNames2;
410 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
412 m->openInputFile(it->first, in);
415 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
416 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
417 m->openOutputFile(thisGroupName, out);
420 if (m->control_pressed) { break; }
422 Sequence currSeq(in); m->gobble(in);
423 out << currSeq.getName() << '\t' << it->second << endl;
430 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
432 //output group counts
433 m->mothurOutEndLine();
435 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
436 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
437 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
439 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
441 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
443 //set fasta file as new current fastafile
445 itTypes = outputTypes.find("fasta");
446 if (itTypes != outputTypes.end()) {
447 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
450 itTypes = outputTypes.find("name");
451 if (itTypes != outputTypes.end()) {
452 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
455 itTypes = outputTypes.find("qfile");
456 if (itTypes != outputTypes.end()) {
457 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
460 itTypes = outputTypes.find("group");
461 if (itTypes != outputTypes.end()) {
462 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
465 m->mothurOutEndLine();
466 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
467 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
468 m->mothurOutEndLine();
473 catch(exception& e) {
474 m->errorOut(e, "TrimSeqsCommand", "execute");
479 /**************************************************************************************/
481 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair* line, linePair* qline) {
485 ofstream trimFASTAFile;
486 m->openOutputFile(trimFileName, trimFASTAFile);
488 ofstream scrapFASTAFile;
489 m->openOutputFile(scrapFileName, scrapFASTAFile);
491 ofstream trimQualFile;
492 ofstream scrapQualFile;
494 m->openOutputFile(trimQFileName, trimQualFile);
495 m->openOutputFile(scrapQFileName, scrapQualFile);
498 ofstream trimNameFile;
499 ofstream scrapNameFile;
501 m->openOutputFile(trimNFileName, trimNameFile);
502 m->openOutputFile(scrapNFileName, scrapNameFile);
506 ofstream outGroupsFile;
507 if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); }
509 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
510 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
511 if (fastaFileNames[i][j] != "") {
513 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
515 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
519 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
527 m->openInputFile(filename, inFASTA);
528 inFASTA.seekg(line->start);
531 if(qFileName != "") {
532 m->openInputFile(qFileName, qFile);
533 qFile.seekg(qline->start);
541 if (m->control_pressed) {
542 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
543 if (oligoFile != "") { outGroupsFile.close(); }
548 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
554 string trashCode = "";
555 int currentSeqsDiffs = 0;
557 Sequence currSeq(inFASTA); m->gobble(inFASTA);
559 QualityScores currQual;
561 currQual = QualityScores(qFile); m->gobble(qFile);
564 string origSeq = currSeq.getUnaligned();
567 int barcodeIndex = 0;
570 if(barcodes.size() != 0){
571 success = stripBarcode(currSeq, currQual, barcodeIndex);
572 if(success > bdiffs) { trashCode += 'b'; }
573 else{ currentSeqsDiffs += success; }
576 if(numFPrimers != 0){
577 success = stripForward(currSeq, currQual, primerIndex);
578 if(success > pdiffs) { trashCode += 'f'; }
579 else{ currentSeqsDiffs += success; }
582 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
584 if(numRPrimers != 0){
585 success = stripReverse(currSeq, currQual);
586 if(!success) { trashCode += 'r'; }
590 success = keepFirstTrim(currSeq, currQual);
594 success = removeLastTrim(currSeq, currQual);
595 if(!success) { trashCode += 'l'; }
600 int origLength = currSeq.getNumBases();
602 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
603 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
604 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
605 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
606 else { success = 1; }
608 //you don't want to trim, if it fails above then scrap it
609 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
611 if(!success) { trashCode += 'q'; }
614 if(minLength > 0 || maxLength > 0){
615 success = cullLength(currSeq);
616 if(!success) { trashCode += 'l'; }
619 success = cullHomoP(currSeq);
620 if(!success) { trashCode += 'h'; }
623 success = cullAmbigs(currSeq);
624 if(!success) { trashCode += 'n'; }
627 if(flip){ // should go last
628 currSeq.reverseComplement();
630 currQual.flipQScores();
634 if(trashCode.length() == 0){
635 currSeq.setAligned(currSeq.getUnaligned());
636 currSeq.printSequence(trimFASTAFile);
639 currQual.printQScores(trimQualFile);
643 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
644 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
645 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
648 if(barcodes.size() != 0){
649 string thisGroup = barcodeNameVector[barcodeIndex];
650 if (primers.size() != 0) { if (primerNameVector[primerIndex] != "") { thisGroup += "." + primerNameVector[primerIndex]; } }
652 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
654 if (nameFile != "") {
655 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
656 if (itName != nameMap.end()) {
657 vector<string> thisSeqsNames;
658 m->splitAtChar(itName->second, thisSeqsNames, ',');
659 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
660 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
662 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
665 map<string, int>::iterator it = groupCounts.find(thisGroup);
666 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
667 else { groupCounts[it->first]++; }
674 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
675 currSeq.printSequence(output);
679 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
680 currQual.printQScores(output);
685 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
686 if (itName != nameMap.end()) {
687 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
688 output << itName->first << '\t' << itName->second << endl;
690 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
695 if(nameFile != ""){ //needs to be before the currSeq name is changed
696 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
697 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
698 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
700 currSeq.setName(currSeq.getName() + '|' + trashCode);
701 currSeq.setUnaligned(origSeq);
702 currSeq.setAligned(origSeq);
703 currSeq.printSequence(scrapFASTAFile);
705 currQual.printQScores(scrapQualFile);
711 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
712 unsigned long int pos = inFASTA.tellg();
713 if ((pos == -1) || (pos >= line->end)) { break; }
715 if (inFASTA.eof()) { break; }
719 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
723 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
727 trimFASTAFile.close();
728 scrapFASTAFile.close();
729 if (oligoFile != "") { outGroupsFile.close(); }
730 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
731 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
735 catch(exception& e) {
736 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
741 /**************************************************************************************************/
743 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
745 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
750 //loop through and create all the processes you want
751 while (process != processors) {
755 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
759 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
760 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
761 vector<vector<string> > tempNameFileNames = nameFileNames;
766 for(int i=0;i<tempFASTAFileNames.size();i++){
767 for(int j=0;j<tempFASTAFileNames[i].size();j++){
768 if (tempFASTAFileNames[i][j] != "") {
769 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
770 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
773 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
774 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
777 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
778 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
785 driverCreateTrim(filename,
787 (trimFASTAFileName + toString(getpid()) + ".temp"),
788 (scrapFASTAFileName + toString(getpid()) + ".temp"),
789 (trimQualFileName + toString(getpid()) + ".temp"),
790 (scrapQualFileName + toString(getpid()) + ".temp"),
791 (trimNameFileName + toString(getpid()) + ".temp"),
792 (scrapNameFileName + toString(getpid()) + ".temp"),
793 (groupFile + toString(getpid()) + ".temp"),
795 tempPrimerQualFileNames,
800 //pass groupCounts to parent
802 string tempFile = filename + toString(getpid()) + ".num.temp";
803 m->openOutputFile(tempFile, out);
804 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
805 out << it->first << '\t' << it->second << endl;
811 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
812 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
819 m->openOutputFile(trimFASTAFileName, temp); temp.close();
820 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
821 m->openOutputFile(trimQualFileName, temp); temp.close();
822 m->openOutputFile(scrapQualFileName, temp); temp.close();
823 m->openOutputFile(trimNameFileName, temp); temp.close();
824 m->openOutputFile(scrapNameFileName, temp); temp.close();
826 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
828 //force parent to wait until all the processes are done
829 for (int i=0;i<processIDS.size();i++) {
830 int temp = processIDS[i];
835 for(int i=0;i<processIDS.size();i++){
837 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
839 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
840 remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
841 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
842 remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
845 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
846 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
847 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
848 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
852 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
853 remove((trimNameFileName + toString(processIDS[i]) + ".temp").c_str());
854 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
855 remove((scrapNameFileName + toString(processIDS[i]) + ".temp").c_str());
858 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
859 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
863 for(int j=0;j<fastaFileNames.size();j++){
864 for(int k=0;k<fastaFileNames[j].size();k++){
865 if (fastaFileNames[j][k] != "") {
866 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
867 remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
870 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
871 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
875 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
876 remove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
884 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
885 m->openInputFile(tempFile, in);
889 in >> group >> tempNum; m->gobble(in);
891 map<string, int>::iterator it = groupCounts.find(group);
892 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
893 else { groupCounts[it->first] += tempNum; }
895 in.close(); remove(tempFile.c_str());
902 catch(exception& e) {
903 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
908 /**************************************************************************************************/
910 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
912 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
913 //set file positions for fasta file
914 fastaFilePos = m->divideFile(filename, processors);
916 if (qfilename == "") { return processors; }
918 //get name of first sequence in each chunk
919 map<string, int> firstSeqNames;
920 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
922 m->openInputFile(filename, in);
923 in.seekg(fastaFilePos[i]);
926 firstSeqNames[temp.getName()] = i;
931 //seach for filePos of each first name in the qfile and save in qfileFilePos
933 m->openInputFile(qfilename, inQual);
936 while(!inQual.eof()){
937 input = m->getline(inQual);
939 if (input.length() != 0) {
940 if(input[0] == '>'){ //this is a sequence name line
941 istringstream nameStream(input);
943 string sname = ""; nameStream >> sname;
944 sname = sname.substr(1);
946 map<string, int>::iterator it = firstSeqNames.find(sname);
948 if(it != firstSeqNames.end()) { //this is the start of a new chunk
949 unsigned long int pos = inQual.tellg();
950 qfileFilePos.push_back(pos - input.length() - 1);
951 firstSeqNames.erase(it);
956 if (firstSeqNames.size() == 0) { break; }
961 if (firstSeqNames.size() != 0) {
962 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
963 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
969 //get last file position of qfile
971 unsigned long int size;
973 //get num bytes in file
974 pFile = fopen (qfilename.c_str(),"rb");
975 if (pFile==NULL) perror ("Error opening file");
977 fseek (pFile, 0, SEEK_END);
982 qfileFilePos.push_back(size);
988 fastaFilePos.push_back(0); qfileFilePos.push_back(0);
989 //get last file position of fastafile
991 unsigned long int size;
993 //get num bytes in file
994 pFile = fopen (filename.c_str(),"rb");
995 if (pFile==NULL) perror ("Error opening file");
997 fseek (pFile, 0, SEEK_END);
1001 fastaFilePos.push_back(size);
1003 //get last file position of fastafile
1006 //get num bytes in file
1007 qFile = fopen (qfilename.c_str(),"rb");
1008 if (qFile==NULL) perror ("Error opening file");
1010 fseek (qFile, 0, SEEK_END);
1014 qfileFilePos.push_back(size);
1020 catch(exception& e) {
1021 m->errorOut(e, "TrimSeqsCommand", "setLines");
1026 //***************************************************************************************************************
1028 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1031 m->openInputFile(oligoFile, inOligos);
1035 string type, oligo, group;
1037 int indexPrimer = 0;
1038 int indexBarcode = 0;
1040 while(!inOligos.eof()){
1045 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1046 m->gobble(inOligos);
1049 m->gobble(inOligos);
1050 //make type case insensitive
1051 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1055 for(int i=0;i<oligo.length();i++){
1056 oligo[i] = toupper(oligo[i]);
1057 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1060 if(type == "FORWARD"){
1063 // get rest of line in case there is a primer name
1064 while (!inOligos.eof()) {
1065 char c = inOligos.get();
1066 if (c == 10 || c == 13){ break; }
1067 else if (c == 32 || c == 9){;} //space or tab
1068 else { group += c; }
1071 //check for repeat barcodes
1072 map<string, int>::iterator itPrime = primers.find(oligo);
1073 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1075 primers[oligo]=indexPrimer; indexPrimer++;
1076 primerNameVector.push_back(group);
1078 else if(type == "REVERSE"){
1079 Sequence oligoRC("reverse", oligo);
1080 oligoRC.reverseComplement();
1081 revPrimer.push_back(oligoRC.getUnaligned());
1083 else if(type == "BARCODE"){
1086 //check for repeat barcodes
1087 map<string, int>::iterator itBar = barcodes.find(oligo);
1088 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1090 barcodes[oligo]=indexBarcode; indexBarcode++;
1091 barcodeNameVector.push_back(group);
1093 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1095 m->gobble(inOligos);
1099 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1101 //add in potential combos
1102 if(barcodeNameVector.size() == 0){
1104 barcodeNameVector.push_back("");
1107 if(primerNameVector.size() == 0){
1109 primerNameVector.push_back("");
1112 fastaFileNames.resize(barcodeNameVector.size());
1113 for(int i=0;i<fastaFileNames.size();i++){
1114 fastaFileNames[i].assign(primerNameVector.size(), "");
1116 if(qFileName != "") { qualFileNames = fastaFileNames; }
1117 if(nameFile != "") { nameFileNames = fastaFileNames; }
1120 set<string> uniqueNames; //used to cleanup outputFileNames
1121 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1122 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1124 string primerName = primerNameVector[itPrimer->second];
1125 string barcodeName = barcodeNameVector[itBar->second];
1127 string comboGroupName = "";
1128 string fastaFileName = "";
1129 string qualFileName = "";
1130 string nameFileName = "";
1132 if(primerName == ""){
1133 comboGroupName = barcodeNameVector[itBar->second];
1136 if(barcodeName == ""){
1137 comboGroupName = primerNameVector[itPrimer->second];
1140 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1146 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1147 if (uniqueNames.count(fastaFileName) == 0) {
1148 outputNames.push_back(fastaFileName);
1149 outputTypes["fasta"].push_back(fastaFileName);
1150 uniqueNames.insert(fastaFileName);
1153 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1154 m->openOutputFile(fastaFileName, temp); temp.close();
1156 if(qFileName != ""){
1157 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1158 if (uniqueNames.count(qualFileName) == 0) {
1159 outputNames.push_back(qualFileName);
1160 outputTypes["qfile"].push_back(qualFileName);
1163 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1164 m->openOutputFile(qualFileName, temp); temp.close();
1168 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1169 if (uniqueNames.count(nameFileName) == 0) {
1170 outputNames.push_back(nameFileName);
1171 outputTypes["name"].push_back(nameFileName);
1174 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1175 m->openOutputFile(nameFileName, temp); temp.close();
1181 numFPrimers = primers.size();
1182 numRPrimers = revPrimer.size();
1185 catch(exception& e) {
1186 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1191 //***************************************************************************************************************
1193 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1196 string rawSequence = seq.getUnaligned();
1197 int success = bdiffs + 1; //guilty until proven innocent
1199 //can you find the barcode
1200 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1201 string oligo = it->first;
1202 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
1203 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
1207 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1209 seq.setUnaligned(rawSequence.substr(oligo.length()));
1211 if(qual.getName() != ""){
1212 qual.trimQScores(oligo.length(), -1);
1220 //if you found the barcode or if you don't want to allow for diffs
1221 if ((bdiffs == 0) || (success == 0)) { return success; }
1223 else { //try aligning and see if you can find it
1227 Alignment* alignment;
1228 if (barcodes.size() > 0) {
1229 map<string,int>::iterator it=barcodes.begin();
1231 for(it;it!=barcodes.end();it++){
1232 if(it->first.length() > maxLength){
1233 maxLength = it->first.length();
1236 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
1238 }else{ alignment = NULL; }
1240 //can you find the barcode
1246 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1247 string oligo = it->first;
1248 // int length = oligo.length();
1250 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
1251 success = bdiffs + 10;
1255 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1256 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1257 oligo = alignment->getSeqAAln();
1258 string temp = alignment->getSeqBAln();
1260 int alnLength = oligo.length();
1262 for(int i=oligo.length()-1;i>=0;i--){
1263 if(oligo[i] != '-'){ alnLength = i+1; break; }
1265 oligo = oligo.substr(0,alnLength);
1266 temp = temp.substr(0,alnLength);
1268 int numDiff = countDiffs(oligo, temp);
1270 if(numDiff < minDiff){
1273 minGroup = it->second;
1275 for(int i=0;i<alnLength;i++){
1281 else if(numDiff == minDiff){
1287 if(minDiff > bdiffs) { success = minDiff; } //no good matches
1288 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
1289 else{ //use the best match
1291 seq.setUnaligned(rawSequence.substr(minPos));
1293 if(qual.getName() != ""){
1294 qual.trimQScores(minPos, -1);
1299 if (alignment != NULL) { delete alignment; }
1306 catch(exception& e) {
1307 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1313 //***************************************************************************************************************
1315 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1317 string rawSequence = seq.getUnaligned();
1318 int success = pdiffs + 1; //guilty until proven innocent
1320 //can you find the primer
1321 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1322 string oligo = it->first;
1323 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
1324 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1328 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1330 seq.setUnaligned(rawSequence.substr(oligo.length()));
1331 if(qual.getName() != ""){
1332 qual.trimQScores(oligo.length(), -1);
1339 //if you found the barcode or if you don't want to allow for diffs
1340 if ((pdiffs == 0) || (success == 0)) { return success; }
1342 else { //try aligning and see if you can find it
1346 Alignment* alignment;
1347 if (primers.size() > 0) {
1348 map<string,int>::iterator it=primers.begin();
1350 for(it;it!=primers.end();it++){
1351 if(it->first.length() > maxLength){
1352 maxLength = it->first.length();
1355 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
1357 }else{ alignment = NULL; }
1359 //can you find the barcode
1365 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1366 string oligo = it->first;
1367 // int length = oligo.length();
1369 if(rawSequence.length() < maxLength){
1370 success = pdiffs + 100;
1374 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1375 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1376 oligo = alignment->getSeqAAln();
1377 string temp = alignment->getSeqBAln();
1379 int alnLength = oligo.length();
1381 for(int i=oligo.length()-1;i>=0;i--){
1382 if(oligo[i] != '-'){ alnLength = i+1; break; }
1384 oligo = oligo.substr(0,alnLength);
1385 temp = temp.substr(0,alnLength);
1387 int numDiff = countDiffs(oligo, temp);
1389 if(numDiff < minDiff){
1392 minGroup = it->second;
1394 for(int i=0;i<alnLength;i++){
1400 else if(numDiff == minDiff){
1406 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1407 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
1408 else{ //use the best match
1410 seq.setUnaligned(rawSequence.substr(minPos));
1411 if(qual.getName() != ""){
1412 qual.trimQScores(minPos, -1);
1417 if (alignment != NULL) { delete alignment; }
1424 catch(exception& e) {
1425 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1430 //***************************************************************************************************************
1432 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1434 string rawSequence = seq.getUnaligned();
1435 bool success = 0; //guilty until proven innocent
1437 for(int i=0;i<numRPrimers;i++){
1438 string oligo = revPrimer[i];
1440 if(rawSequence.length() < oligo.length()){
1445 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1446 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1447 if(qual.getName() != ""){
1448 qual.trimQScores(-1, rawSequence.length()-oligo.length());
1457 catch(exception& e) {
1458 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1463 //***************************************************************************************************************
1465 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1468 if(qscores.getName() != ""){
1469 qscores.trimQScores(-1, keepFirst);
1471 sequence.trim(keepFirst);
1474 catch(exception& e) {
1475 m->errorOut(e, "keepFirstTrim", "countDiffs");
1481 //***************************************************************************************************************
1483 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1487 int length = sequence.getNumBases() - removeLast;
1490 if(qscores.getName() != ""){
1491 qscores.trimQScores(-1, length);
1493 sequence.trim(length);
1502 catch(exception& e) {
1503 m->errorOut(e, "removeLastTrim", "countDiffs");
1509 //***************************************************************************************************************
1511 bool TrimSeqsCommand::cullLength(Sequence& seq){
1514 int length = seq.getNumBases();
1515 bool success = 0; //guilty until proven innocent
1517 if(length >= minLength && maxLength == 0) { success = 1; }
1518 else if(length >= minLength && length <= maxLength) { success = 1; }
1519 else { success = 0; }
1524 catch(exception& e) {
1525 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1531 //***************************************************************************************************************
1533 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1535 int longHomoP = seq.getLongHomoPolymer();
1536 bool success = 0; //guilty until proven innocent
1538 if(longHomoP <= maxHomoP){ success = 1; }
1539 else { success = 0; }
1543 catch(exception& e) {
1544 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1550 //***************************************************************************************************************
1552 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1554 int numNs = seq.getAmbigBases();
1555 bool success = 0; //guilty until proven innocent
1557 if(numNs <= maxAmbig) { success = 1; }
1558 else { success = 0; }
1562 catch(exception& e) {
1563 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1569 //***************************************************************************************************************
1571 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1574 int length = oligo.length();
1576 for(int i=0;i<length;i++){
1578 if(oligo[i] != seq[i]){
1579 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
1580 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
1581 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
1582 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
1583 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
1584 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1585 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
1586 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1587 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1588 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1589 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
1590 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1592 if(success == 0) { break; }
1601 catch(exception& e) {
1602 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1608 //***************************************************************************************************************
1610 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1613 int length = oligo.length();
1616 for(int i=0;i<length;i++){
1618 if(oligo[i] != seq[i]){
1619 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
1620 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
1621 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
1622 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
1623 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
1624 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1625 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
1626 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1627 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1628 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1629 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
1630 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1637 catch(exception& e) {
1638 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1644 //***************************************************************************************************************