5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
13 //**********************************************************************************************************************
14 vector<string> TrimSeqsCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
19 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
20 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
21 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
22 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
23 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
24 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
25 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
26 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
27 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
28 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
29 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
30 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
31 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
32 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
33 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
34 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
35 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
36 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
37 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
38 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
39 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
40 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
42 vector<string> myArray;
43 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
47 m->errorOut(e, "TrimSeqsCommand", "setParameters");
51 //**********************************************************************************************************************
52 string TrimSeqsCommand::getHelpString(){
54 string helpString = "";
55 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
56 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
57 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
58 helpString += "The fasta parameter is required.\n";
59 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
60 helpString += "The oligos parameter allows you to provide an oligos file.\n";
61 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
62 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
63 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
64 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
65 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
66 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
67 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
68 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
69 helpString += "The qfile parameter allows you to provide a quality file.\n";
70 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
71 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
72 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
73 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
74 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
75 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
76 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
77 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
78 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
79 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
80 helpString += "The trim.seqs command should be in the following format: \n";
81 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
82 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
83 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
84 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
85 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
89 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
95 //**********************************************************************************************************************
97 TrimSeqsCommand::TrimSeqsCommand(){
99 abort = true; calledHelp = true;
101 vector<string> tempOutNames;
102 outputTypes["fasta"] = tempOutNames;
103 outputTypes["qfile"] = tempOutNames;
104 outputTypes["group"] = tempOutNames;
105 outputTypes["name"] = tempOutNames;
107 catch(exception& e) {
108 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
112 //***************************************************************************************************************
114 TrimSeqsCommand::TrimSeqsCommand(string option) {
117 abort = false; calledHelp = false;
120 //allow user to run help
121 if(option == "help") { help(); abort = true; calledHelp = true; }
122 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
125 vector<string> myArray = setParameters();
127 OptionParser parser(option);
128 map<string,string> parameters = parser.getParameters();
130 ValidParameters validParameter;
131 map<string,string>::iterator it;
133 //check to make sure all parameters are valid for command
134 for (it = parameters.begin(); it != parameters.end(); it++) {
135 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
138 //initialize outputTypes
139 vector<string> tempOutNames;
140 outputTypes["fasta"] = tempOutNames;
141 outputTypes["qfile"] = tempOutNames;
142 outputTypes["group"] = tempOutNames;
143 outputTypes["name"] = tempOutNames;
145 //if the user changes the input directory command factory will send this info to us in the output parameter
146 string inputDir = validParameter.validFile(parameters, "inputdir", false);
147 if (inputDir == "not found"){ inputDir = ""; }
150 it = parameters.find("fasta");
151 //user has given a template file
152 if(it != parameters.end()){
153 path = m->hasPath(it->second);
154 //if the user has not given a path then, add inputdir. else leave path alone.
155 if (path == "") { parameters["fasta"] = inputDir + it->second; }
158 it = parameters.find("oligos");
159 //user has given a template file
160 if(it != parameters.end()){
161 path = m->hasPath(it->second);
162 //if the user has not given a path then, add inputdir. else leave path alone.
163 if (path == "") { parameters["oligos"] = inputDir + it->second; }
166 it = parameters.find("qfile");
167 //user has given a template file
168 if(it != parameters.end()){
169 path = m->hasPath(it->second);
170 //if the user has not given a path then, add inputdir. else leave path alone.
171 if (path == "") { parameters["qfile"] = inputDir + it->second; }
174 it = parameters.find("name");
175 //user has given a template file
176 if(it != parameters.end()){
177 path = m->hasPath(it->second);
178 //if the user has not given a path then, add inputdir. else leave path alone.
179 if (path == "") { parameters["name"] = inputDir + it->second; }
185 //check for required parameters
186 fastaFile = validParameter.validFile(parameters, "fasta", true);
187 if (fastaFile == "not found") {
188 fastaFile = m->getFastaFile();
189 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
190 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
191 }else if (fastaFile == "not open") { abort = true; }
192 else { m->setFastaFile(fastaFile); }
194 //if the user changes the output directory command factory will send this info to us in the output parameter
195 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
197 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
201 //check for optional parameter and set defaults
202 // ...at some point should added some additional type checking...
204 temp = validParameter.validFile(parameters, "flip", false);
205 if (temp == "not found"){ flip = 0; }
206 else if(m->isTrue(temp)) { flip = 1; }
208 temp = validParameter.validFile(parameters, "oligos", true);
209 if (temp == "not found"){ oligoFile = ""; }
210 else if(temp == "not open"){ abort = true; }
211 else { oligoFile = temp; m->setOligosFile(oligoFile); }
214 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
215 convert(temp, maxAmbig);
217 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
218 convert(temp, maxHomoP);
220 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
221 convert(temp, minLength);
223 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
224 convert(temp, maxLength);
226 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
227 convert(temp, bdiffs);
229 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
230 convert(temp, pdiffs);
232 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
233 convert(temp, tdiffs);
235 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
237 temp = validParameter.validFile(parameters, "qfile", true);
238 if (temp == "not found") { qFileName = ""; }
239 else if(temp == "not open") { abort = true; }
240 else { qFileName = temp; m->setQualFile(qFileName); }
242 temp = validParameter.validFile(parameters, "name", true);
243 if (temp == "not found") { nameFile = ""; }
244 else if(temp == "not open") { nameFile = ""; abort = true; }
245 else { nameFile = temp; m->setNameFile(nameFile); }
247 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
248 convert(temp, qThreshold);
250 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
251 qtrim = m->isTrue(temp);
253 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
254 convert(temp, qRollAverage);
256 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
257 convert(temp, qWindowAverage);
259 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
260 convert(temp, qWindowSize);
262 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
263 convert(temp, qWindowStep);
265 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
266 convert(temp, qAverage);
268 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
269 convert(temp, keepFirst);
271 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
272 convert(temp, removeLast);
274 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
275 allFiles = m->isTrue(temp);
277 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
278 m->setProcessors(temp);
279 convert(temp, processors);
282 if(allFiles && (oligoFile == "")){
283 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
285 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
286 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
290 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
291 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
297 catch(exception& e) {
298 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
302 //***************************************************************************************************************
304 int TrimSeqsCommand::execute(){
307 if (abort == true) { if (calledHelp) { return 0; } return 2; }
309 numFPrimers = 0; //this needs to be initialized
311 vector<vector<string> > fastaFileNames;
312 vector<vector<string> > qualFileNames;
313 vector<vector<string> > nameFileNames;
315 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
316 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
318 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
319 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
321 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
322 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
324 if (qFileName != "") {
325 outputNames.push_back(trimQualFile);
326 outputNames.push_back(scrapQualFile);
327 outputTypes["qfile"].push_back(trimQualFile);
328 outputTypes["qfile"].push_back(scrapQualFile);
331 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
332 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
334 if (nameFile != "") {
335 m->readNames(nameFile, nameMap);
336 outputNames.push_back(trimNameFile);
337 outputNames.push_back(scrapNameFile);
338 outputTypes["name"].push_back(trimNameFile);
339 outputTypes["name"].push_back(scrapNameFile);
342 if (m->control_pressed) { return 0; }
344 string outputGroupFileName;
346 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
347 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
348 getOligos(fastaFileNames, qualFileNames, nameFileNames);
351 vector<unsigned long int> fastaFilePos;
352 vector<unsigned long int> qFilePos;
354 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
356 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
357 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
358 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
360 if(qFileName == "") { qLines = lines; } //files with duds
362 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
364 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
366 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
369 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
372 if (m->control_pressed) { return 0; }
375 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
376 map<string, string>::iterator it;
377 set<string> namesToRemove;
378 for(int i=0;i<fastaFileNames.size();i++){
379 for(int j=0;j<fastaFileNames[0].size();j++){
380 if (fastaFileNames[i][j] != "") {
381 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
382 if(m->isBlank(fastaFileNames[i][j])){
383 m->mothurRemove(fastaFileNames[i][j]);
384 namesToRemove.insert(fastaFileNames[i][j]);
387 m->mothurRemove(qualFileNames[i][j]);
388 namesToRemove.insert(qualFileNames[i][j]);
392 m->mothurRemove(nameFileNames[i][j]);
393 namesToRemove.insert(nameFileNames[i][j]);
396 it = uniqueFastaNames.find(fastaFileNames[i][j]);
397 if (it == uniqueFastaNames.end()) {
398 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
406 //remove names for outputFileNames, just cleans up the output
407 vector<string> outputNames2;
408 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
409 outputNames = outputNames2;
411 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
413 m->openInputFile(it->first, in);
416 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
417 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
418 m->openOutputFile(thisGroupName, out);
421 if (m->control_pressed) { break; }
423 Sequence currSeq(in); m->gobble(in);
424 out << currSeq.getName() << '\t' << it->second << endl;
431 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
433 //output group counts
434 m->mothurOutEndLine();
436 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
437 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
438 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
440 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
442 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
444 //set fasta file as new current fastafile
446 itTypes = outputTypes.find("fasta");
447 if (itTypes != outputTypes.end()) {
448 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
451 itTypes = outputTypes.find("name");
452 if (itTypes != outputTypes.end()) {
453 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
456 itTypes = outputTypes.find("qfile");
457 if (itTypes != outputTypes.end()) {
458 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
461 itTypes = outputTypes.find("group");
462 if (itTypes != outputTypes.end()) {
463 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
466 m->mothurOutEndLine();
467 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
468 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
469 m->mothurOutEndLine();
474 catch(exception& e) {
475 m->errorOut(e, "TrimSeqsCommand", "execute");
480 /**************************************************************************************/
482 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair* line, linePair* qline) {
486 ofstream trimFASTAFile;
487 m->openOutputFile(trimFileName, trimFASTAFile);
489 ofstream scrapFASTAFile;
490 m->openOutputFile(scrapFileName, scrapFASTAFile);
492 ofstream trimQualFile;
493 ofstream scrapQualFile;
495 m->openOutputFile(trimQFileName, trimQualFile);
496 m->openOutputFile(scrapQFileName, scrapQualFile);
499 ofstream trimNameFile;
500 ofstream scrapNameFile;
502 m->openOutputFile(trimNFileName, trimNameFile);
503 m->openOutputFile(scrapNFileName, scrapNameFile);
507 ofstream outGroupsFile;
508 if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); }
510 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
511 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
512 if (fastaFileNames[i][j] != "") {
514 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
516 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
520 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
528 m->openInputFile(filename, inFASTA);
529 inFASTA.seekg(line->start);
532 if(qFileName != "") {
533 m->openInputFile(qFileName, qFile);
534 qFile.seekg(qline->start);
542 if (m->control_pressed) {
543 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
544 if (oligoFile != "") { outGroupsFile.close(); }
549 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
555 string trashCode = "";
556 int currentSeqsDiffs = 0;
558 Sequence currSeq(inFASTA); m->gobble(inFASTA);
559 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
560 QualityScores currQual;
562 currQual = QualityScores(qFile); m->gobble(qFile);
565 string origSeq = currSeq.getUnaligned();
568 int barcodeIndex = 0;
571 if(barcodes.size() != 0){
572 success = stripBarcode(currSeq, currQual, barcodeIndex);
573 if(success > bdiffs) { trashCode += 'b'; }
574 else{ currentSeqsDiffs += success; }
577 if(numFPrimers != 0){
578 success = stripForward(currSeq, currQual, primerIndex);
579 if(success > pdiffs) { trashCode += 'f'; }
580 else{ currentSeqsDiffs += success; }
583 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
585 if(numRPrimers != 0){
586 success = stripReverse(currSeq, currQual);
587 if(!success) { trashCode += 'r'; }
591 success = keepFirstTrim(currSeq, currQual);
595 success = removeLastTrim(currSeq, currQual);
596 if(!success) { trashCode += 'l'; }
601 int origLength = currSeq.getNumBases();
603 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
604 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
605 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
606 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
607 else { success = 1; }
609 //you don't want to trim, if it fails above then scrap it
610 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
612 if(!success) { trashCode += 'q'; }
615 if(minLength > 0 || maxLength > 0){
616 success = cullLength(currSeq);
617 if(!success) { trashCode += 'l'; }
620 success = cullHomoP(currSeq);
621 if(!success) { trashCode += 'h'; }
624 success = cullAmbigs(currSeq);
625 if(!success) { trashCode += 'n'; }
628 if(flip){ // should go last
629 currSeq.reverseComplement();
631 currQual.flipQScores();
635 if(trashCode.length() == 0){
636 currSeq.setAligned(currSeq.getUnaligned());
637 currSeq.printSequence(trimFASTAFile);
640 currQual.printQScores(trimQualFile);
644 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
645 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
646 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
649 if(barcodes.size() != 0){
650 string thisGroup = barcodeNameVector[barcodeIndex];
651 if (primers.size() != 0) { if (primerNameVector[primerIndex] != "") { thisGroup += "." + primerNameVector[primerIndex]; } }
653 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
655 if (nameFile != "") {
656 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
657 if (itName != nameMap.end()) {
658 vector<string> thisSeqsNames;
659 m->splitAtChar(itName->second, thisSeqsNames, ',');
660 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
661 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
663 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
666 map<string, int>::iterator it = groupCounts.find(thisGroup);
667 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
668 else { groupCounts[it->first]++; }
675 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
676 currSeq.printSequence(output);
680 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
681 currQual.printQScores(output);
686 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
687 if (itName != nameMap.end()) {
688 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
689 output << itName->first << '\t' << itName->second << endl;
691 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
696 if(nameFile != ""){ //needs to be before the currSeq name is changed
697 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
698 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
699 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
701 currSeq.setName(currSeq.getName() + '|' + trashCode);
702 currSeq.setUnaligned(origSeq);
703 currSeq.setAligned(origSeq);
704 currSeq.printSequence(scrapFASTAFile);
706 currQual.printQScores(scrapQualFile);
712 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
713 unsigned long int pos = inFASTA.tellg();
714 if ((pos == -1) || (pos >= line->end)) { break; }
717 if (inFASTA.eof()) { break; }
721 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
725 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
729 trimFASTAFile.close();
730 scrapFASTAFile.close();
731 if (oligoFile != "") { outGroupsFile.close(); }
732 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
733 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
737 catch(exception& e) {
738 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
743 /**************************************************************************************************/
745 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
747 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
752 //loop through and create all the processes you want
753 while (process != processors) {
757 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
761 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
762 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
763 vector<vector<string> > tempNameFileNames = nameFileNames;
768 for(int i=0;i<tempFASTAFileNames.size();i++){
769 for(int j=0;j<tempFASTAFileNames[i].size();j++){
770 if (tempFASTAFileNames[i][j] != "") {
771 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
772 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
775 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
776 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
779 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
780 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
787 driverCreateTrim(filename,
789 (trimFASTAFileName + toString(getpid()) + ".temp"),
790 (scrapFASTAFileName + toString(getpid()) + ".temp"),
791 (trimQualFileName + toString(getpid()) + ".temp"),
792 (scrapQualFileName + toString(getpid()) + ".temp"),
793 (trimNameFileName + toString(getpid()) + ".temp"),
794 (scrapNameFileName + toString(getpid()) + ".temp"),
795 (groupFile + toString(getpid()) + ".temp"),
797 tempPrimerQualFileNames,
802 //pass groupCounts to parent
805 string tempFile = filename + toString(getpid()) + ".num.temp";
806 m->openOutputFile(tempFile, out);
808 out << groupCounts.size() << endl;
810 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
811 out << it->first << '\t' << it->second << endl;
817 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
818 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
825 m->openOutputFile(trimFASTAFileName, temp); temp.close();
826 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
828 m->openOutputFile(trimQualFileName, temp); temp.close();
829 m->openOutputFile(scrapQualFileName, temp); temp.close();
831 if (nameFile != "") {
832 m->openOutputFile(trimNameFileName, temp); temp.close();
833 m->openOutputFile(scrapNameFileName, temp); temp.close();
836 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
838 //force parent to wait until all the processes are done
839 for (int i=0;i<processIDS.size();i++) {
840 int temp = processIDS[i];
845 for(int i=0;i<processIDS.size();i++){
847 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
849 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
850 m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp"));
851 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
852 m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp"));
855 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
856 m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp"));
857 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
858 m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp"));
862 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
863 m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp"));
864 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
865 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
869 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
870 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
875 for(int j=0;j<fastaFileNames.size();j++){
876 for(int k=0;k<fastaFileNames[j].size();k++){
877 if (fastaFileNames[j][k] != "") {
878 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
879 m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"));
882 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
883 m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"));
887 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
888 m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"));
897 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
898 m->openInputFile(tempFile, in);
902 in >> tempNum; m->gobble(in);
906 in >> group >> tempNum; m->gobble(in);
908 map<string, int>::iterator it = groupCounts.find(group);
909 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
910 else { groupCounts[it->first] += tempNum; }
913 in.close(); m->mothurRemove(tempFile);
921 catch(exception& e) {
922 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
927 /**************************************************************************************************/
929 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
931 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
932 //set file positions for fasta file
933 fastaFilePos = m->divideFile(filename, processors);
935 if (qfilename == "") { return processors; }
937 //get name of first sequence in each chunk
938 map<string, int> firstSeqNames;
939 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
941 m->openInputFile(filename, in);
942 in.seekg(fastaFilePos[i]);
945 firstSeqNames[temp.getName()] = i;
950 //seach for filePos of each first name in the qfile and save in qfileFilePos
952 m->openInputFile(qfilename, inQual);
955 while(!inQual.eof()){
956 input = m->getline(inQual);
958 if (input.length() != 0) {
959 if(input[0] == '>'){ //this is a sequence name line
960 istringstream nameStream(input);
962 string sname = ""; nameStream >> sname;
963 sname = sname.substr(1);
965 map<string, int>::iterator it = firstSeqNames.find(sname);
967 if(it != firstSeqNames.end()) { //this is the start of a new chunk
968 unsigned long int pos = inQual.tellg();
969 qfileFilePos.push_back(pos - input.length() - 1);
970 firstSeqNames.erase(it);
975 if (firstSeqNames.size() == 0) { break; }
980 if (firstSeqNames.size() != 0) {
981 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
982 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
988 //get last file position of qfile
990 unsigned long int size;
992 //get num bytes in file
993 pFile = fopen (qfilename.c_str(),"rb");
994 if (pFile==NULL) perror ("Error opening file");
996 fseek (pFile, 0, SEEK_END);
1001 qfileFilePos.push_back(size);
1007 fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1008 //get last file position of fastafile
1010 unsigned long int size;
1012 //get num bytes in file
1013 pFile = fopen (filename.c_str(),"rb");
1014 if (pFile==NULL) perror ("Error opening file");
1016 fseek (pFile, 0, SEEK_END);
1020 fastaFilePos.push_back(size);
1022 //get last file position of fastafile
1025 //get num bytes in file
1026 qFile = fopen (qfilename.c_str(),"rb");
1027 if (qFile==NULL) perror ("Error opening file");
1029 fseek (qFile, 0, SEEK_END);
1033 qfileFilePos.push_back(size);
1039 catch(exception& e) {
1040 m->errorOut(e, "TrimSeqsCommand", "setLines");
1045 //***************************************************************************************************************
1047 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1050 m->openInputFile(oligoFile, inOligos);
1054 string type, oligo, group;
1056 int indexPrimer = 0;
1057 int indexBarcode = 0;
1059 while(!inOligos.eof()){
1064 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1065 m->gobble(inOligos);
1068 m->gobble(inOligos);
1069 //make type case insensitive
1070 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1074 for(int i=0;i<oligo.length();i++){
1075 oligo[i] = toupper(oligo[i]);
1076 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1079 if(type == "FORWARD"){
1082 // get rest of line in case there is a primer name
1083 while (!inOligos.eof()) {
1084 char c = inOligos.get();
1085 if (c == 10 || c == 13){ break; }
1086 else if (c == 32 || c == 9){;} //space or tab
1087 else { group += c; }
1090 //check for repeat barcodes
1091 map<string, int>::iterator itPrime = primers.find(oligo);
1092 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1094 primers[oligo]=indexPrimer; indexPrimer++;
1095 primerNameVector.push_back(group);
1097 else if(type == "REVERSE"){
1098 Sequence oligoRC("reverse", oligo);
1099 oligoRC.reverseComplement();
1100 revPrimer.push_back(oligoRC.getUnaligned());
1102 else if(type == "BARCODE"){
1105 //check for repeat barcodes
1106 map<string, int>::iterator itBar = barcodes.find(oligo);
1107 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1109 barcodes[oligo]=indexBarcode; indexBarcode++;
1110 barcodeNameVector.push_back(group);
1112 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1114 m->gobble(inOligos);
1118 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1120 //add in potential combos
1121 if(barcodeNameVector.size() == 0){
1123 barcodeNameVector.push_back("");
1126 if(primerNameVector.size() == 0){
1128 primerNameVector.push_back("");
1131 fastaFileNames.resize(barcodeNameVector.size());
1132 for(int i=0;i<fastaFileNames.size();i++){
1133 fastaFileNames[i].assign(primerNameVector.size(), "");
1135 if(qFileName != "") { qualFileNames = fastaFileNames; }
1136 if(nameFile != "") { nameFileNames = fastaFileNames; }
1139 set<string> uniqueNames; //used to cleanup outputFileNames
1140 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1141 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1143 string primerName = primerNameVector[itPrimer->second];
1144 string barcodeName = barcodeNameVector[itBar->second];
1146 string comboGroupName = "";
1147 string fastaFileName = "";
1148 string qualFileName = "";
1149 string nameFileName = "";
1151 if(primerName == ""){
1152 comboGroupName = barcodeNameVector[itBar->second];
1155 if(barcodeName == ""){
1156 comboGroupName = primerNameVector[itPrimer->second];
1159 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1165 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1166 if (uniqueNames.count(fastaFileName) == 0) {
1167 outputNames.push_back(fastaFileName);
1168 outputTypes["fasta"].push_back(fastaFileName);
1169 uniqueNames.insert(fastaFileName);
1172 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1173 m->openOutputFile(fastaFileName, temp); temp.close();
1175 if(qFileName != ""){
1176 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1177 if (uniqueNames.count(qualFileName) == 0) {
1178 outputNames.push_back(qualFileName);
1179 outputTypes["qfile"].push_back(qualFileName);
1182 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1183 m->openOutputFile(qualFileName, temp); temp.close();
1187 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1188 if (uniqueNames.count(nameFileName) == 0) {
1189 outputNames.push_back(nameFileName);
1190 outputTypes["name"].push_back(nameFileName);
1193 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1194 m->openOutputFile(nameFileName, temp); temp.close();
1200 numFPrimers = primers.size();
1201 numRPrimers = revPrimer.size();
1204 catch(exception& e) {
1205 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1210 //***************************************************************************************************************
1212 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1215 string rawSequence = seq.getUnaligned();
1216 int success = bdiffs + 1; //guilty until proven innocent
1218 //can you find the barcode
1219 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1220 string oligo = it->first;
1221 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
1222 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
1226 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1228 seq.setUnaligned(rawSequence.substr(oligo.length()));
1230 if(qual.getName() != ""){
1231 qual.trimQScores(oligo.length(), -1);
1239 //if you found the barcode or if you don't want to allow for diffs
1240 if ((bdiffs == 0) || (success == 0)) { return success; }
1242 else { //try aligning and see if you can find it
1246 Alignment* alignment;
1247 if (barcodes.size() > 0) {
1248 map<string,int>::iterator it=barcodes.begin();
1250 for(it;it!=barcodes.end();it++){
1251 if(it->first.length() > maxLength){
1252 maxLength = it->first.length();
1255 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
1257 }else{ alignment = NULL; }
1259 //can you find the barcode
1265 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1266 string oligo = it->first;
1267 // int length = oligo.length();
1269 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
1270 success = bdiffs + 10;
1274 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1275 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1276 oligo = alignment->getSeqAAln();
1277 string temp = alignment->getSeqBAln();
1279 int alnLength = oligo.length();
1281 for(int i=oligo.length()-1;i>=0;i--){
1282 if(oligo[i] != '-'){ alnLength = i+1; break; }
1284 oligo = oligo.substr(0,alnLength);
1285 temp = temp.substr(0,alnLength);
1287 int numDiff = countDiffs(oligo, temp);
1289 if(numDiff < minDiff){
1292 minGroup = it->second;
1294 for(int i=0;i<alnLength;i++){
1300 else if(numDiff == minDiff){
1306 if(minDiff > bdiffs) { success = minDiff; } //no good matches
1307 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
1308 else{ //use the best match
1310 seq.setUnaligned(rawSequence.substr(minPos));
1312 if(qual.getName() != ""){
1313 qual.trimQScores(minPos, -1);
1318 if (alignment != NULL) { delete alignment; }
1325 catch(exception& e) {
1326 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1332 //***************************************************************************************************************
1334 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1336 string rawSequence = seq.getUnaligned();
1337 int success = pdiffs + 1; //guilty until proven innocent
1339 //can you find the primer
1340 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1341 string oligo = it->first;
1342 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
1343 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1347 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1349 seq.setUnaligned(rawSequence.substr(oligo.length()));
1350 if(qual.getName() != ""){
1351 qual.trimQScores(oligo.length(), -1);
1358 //if you found the barcode or if you don't want to allow for diffs
1359 if ((pdiffs == 0) || (success == 0)) { return success; }
1361 else { //try aligning and see if you can find it
1365 Alignment* alignment;
1366 if (primers.size() > 0) {
1367 map<string,int>::iterator it=primers.begin();
1369 for(it;it!=primers.end();it++){
1370 if(it->first.length() > maxLength){
1371 maxLength = it->first.length();
1374 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
1376 }else{ alignment = NULL; }
1378 //can you find the barcode
1384 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1385 string oligo = it->first;
1386 // int length = oligo.length();
1388 if(rawSequence.length() < maxLength){
1389 success = pdiffs + 100;
1393 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1394 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1395 oligo = alignment->getSeqAAln();
1396 string temp = alignment->getSeqBAln();
1398 int alnLength = oligo.length();
1400 for(int i=oligo.length()-1;i>=0;i--){
1401 if(oligo[i] != '-'){ alnLength = i+1; break; }
1403 oligo = oligo.substr(0,alnLength);
1404 temp = temp.substr(0,alnLength);
1406 int numDiff = countDiffs(oligo, temp);
1408 if(numDiff < minDiff){
1411 minGroup = it->second;
1413 for(int i=0;i<alnLength;i++){
1419 else if(numDiff == minDiff){
1425 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1426 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
1427 else{ //use the best match
1429 seq.setUnaligned(rawSequence.substr(minPos));
1430 if(qual.getName() != ""){
1431 qual.trimQScores(minPos, -1);
1436 if (alignment != NULL) { delete alignment; }
1443 catch(exception& e) {
1444 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1449 //***************************************************************************************************************
1451 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1453 string rawSequence = seq.getUnaligned();
1454 bool success = 0; //guilty until proven innocent
1456 for(int i=0;i<numRPrimers;i++){
1457 string oligo = revPrimer[i];
1459 if(rawSequence.length() < oligo.length()){
1464 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1465 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1466 if(qual.getName() != ""){
1467 qual.trimQScores(-1, rawSequence.length()-oligo.length());
1476 catch(exception& e) {
1477 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1482 //***************************************************************************************************************
1484 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1487 if(qscores.getName() != ""){
1488 qscores.trimQScores(-1, keepFirst);
1490 sequence.trim(keepFirst);
1493 catch(exception& e) {
1494 m->errorOut(e, "keepFirstTrim", "countDiffs");
1500 //***************************************************************************************************************
1502 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1506 int length = sequence.getNumBases() - removeLast;
1509 if(qscores.getName() != ""){
1510 qscores.trimQScores(-1, length);
1512 sequence.trim(length);
1521 catch(exception& e) {
1522 m->errorOut(e, "removeLastTrim", "countDiffs");
1528 //***************************************************************************************************************
1530 bool TrimSeqsCommand::cullLength(Sequence& seq){
1533 int length = seq.getNumBases();
1534 bool success = 0; //guilty until proven innocent
1536 if(length >= minLength && maxLength == 0) { success = 1; }
1537 else if(length >= minLength && length <= maxLength) { success = 1; }
1538 else { success = 0; }
1543 catch(exception& e) {
1544 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1550 //***************************************************************************************************************
1552 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1554 int longHomoP = seq.getLongHomoPolymer();
1555 bool success = 0; //guilty until proven innocent
1557 if(longHomoP <= maxHomoP){ success = 1; }
1558 else { success = 0; }
1562 catch(exception& e) {
1563 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1569 //***************************************************************************************************************
1571 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1573 int numNs = seq.getAmbigBases();
1574 bool success = 0; //guilty until proven innocent
1576 if(numNs <= maxAmbig) { success = 1; }
1577 else { success = 0; }
1581 catch(exception& e) {
1582 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1588 //***************************************************************************************************************
1590 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1593 int length = oligo.length();
1595 for(int i=0;i<length;i++){
1597 if(oligo[i] != seq[i]){
1598 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
1599 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
1600 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
1601 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
1602 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
1603 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1604 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
1605 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1606 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1607 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1608 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
1609 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1611 if(success == 0) { break; }
1620 catch(exception& e) {
1621 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1627 //***************************************************************************************************************
1629 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1632 int length = oligo.length();
1635 for(int i=0;i<length;i++){
1637 if(oligo[i] != seq[i]){
1638 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
1639 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
1640 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
1641 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
1642 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
1643 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1644 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
1645 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1646 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1647 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1648 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
1649 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1656 catch(exception& e) {
1657 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1663 //***************************************************************************************************************