5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
13 //**********************************************************************************************************************
14 vector<string> TrimSeqsCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
19 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
20 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
21 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
22 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
23 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
24 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
25 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
26 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
27 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
28 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
29 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
30 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
31 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
32 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
33 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
34 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
35 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
36 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
37 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
38 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
39 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
40 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
42 vector<string> myArray;
43 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
47 m->errorOut(e, "TrimSeqsCommand", "setParameters");
51 //**********************************************************************************************************************
52 string TrimSeqsCommand::getHelpString(){
54 string helpString = "";
55 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
56 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
57 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
58 helpString += "The fasta parameter is required.\n";
59 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
60 helpString += "The oligos parameter allows you to provide an oligos file.\n";
61 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
62 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
63 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
64 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
65 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
66 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
67 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
68 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
69 helpString += "The qfile parameter allows you to provide a quality file.\n";
70 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
71 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
72 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
73 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
74 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
75 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
76 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
77 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
78 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
79 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
80 helpString += "The trim.seqs command should be in the following format: \n";
81 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
82 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
83 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
84 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
85 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
89 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
95 //**********************************************************************************************************************
97 TrimSeqsCommand::TrimSeqsCommand(){
99 abort = true; calledHelp = true;
101 vector<string> tempOutNames;
102 outputTypes["fasta"] = tempOutNames;
103 outputTypes["qfile"] = tempOutNames;
104 outputTypes["group"] = tempOutNames;
105 outputTypes["name"] = tempOutNames;
107 catch(exception& e) {
108 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
112 //***************************************************************************************************************
114 TrimSeqsCommand::TrimSeqsCommand(string option) {
117 abort = false; calledHelp = false;
120 //allow user to run help
121 if(option == "help") { help(); abort = true; calledHelp = true; }
122 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
125 vector<string> myArray = setParameters();
127 OptionParser parser(option);
128 map<string,string> parameters = parser.getParameters();
130 ValidParameters validParameter;
131 map<string,string>::iterator it;
133 //check to make sure all parameters are valid for command
134 for (it = parameters.begin(); it != parameters.end(); it++) {
135 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
138 //initialize outputTypes
139 vector<string> tempOutNames;
140 outputTypes["fasta"] = tempOutNames;
141 outputTypes["qfile"] = tempOutNames;
142 outputTypes["group"] = tempOutNames;
143 outputTypes["name"] = tempOutNames;
145 //if the user changes the input directory command factory will send this info to us in the output parameter
146 string inputDir = validParameter.validFile(parameters, "inputdir", false);
147 if (inputDir == "not found"){ inputDir = ""; }
150 it = parameters.find("fasta");
151 //user has given a template file
152 if(it != parameters.end()){
153 path = m->hasPath(it->second);
154 //if the user has not given a path then, add inputdir. else leave path alone.
155 if (path == "") { parameters["fasta"] = inputDir + it->second; }
158 it = parameters.find("oligos");
159 //user has given a template file
160 if(it != parameters.end()){
161 path = m->hasPath(it->second);
162 //if the user has not given a path then, add inputdir. else leave path alone.
163 if (path == "") { parameters["oligos"] = inputDir + it->second; }
166 it = parameters.find("qfile");
167 //user has given a template file
168 if(it != parameters.end()){
169 path = m->hasPath(it->second);
170 //if the user has not given a path then, add inputdir. else leave path alone.
171 if (path == "") { parameters["qfile"] = inputDir + it->second; }
174 it = parameters.find("name");
175 //user has given a template file
176 if(it != parameters.end()){
177 path = m->hasPath(it->second);
178 //if the user has not given a path then, add inputdir. else leave path alone.
179 if (path == "") { parameters["name"] = inputDir + it->second; }
185 //check for required parameters
186 fastaFile = validParameter.validFile(parameters, "fasta", true);
187 if (fastaFile == "not found") {
188 fastaFile = m->getFastaFile();
189 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
190 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
191 }else if (fastaFile == "not open") { abort = true; }
192 else { m->setFastaFile(fastaFile); }
194 //if the user changes the output directory command factory will send this info to us in the output parameter
195 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
197 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
201 //check for optional parameter and set defaults
202 // ...at some point should added some additional type checking...
204 temp = validParameter.validFile(parameters, "flip", false);
205 if (temp == "not found"){ flip = 0; }
206 else if(m->isTrue(temp)) { flip = 1; }
208 temp = validParameter.validFile(parameters, "oligos", true);
209 if (temp == "not found"){ oligoFile = ""; }
210 else if(temp == "not open"){ abort = true; }
211 else { oligoFile = temp; m->setOligosFile(oligoFile); }
214 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
215 convert(temp, maxAmbig);
217 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
218 convert(temp, maxHomoP);
220 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
221 convert(temp, minLength);
223 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
224 convert(temp, maxLength);
226 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
227 convert(temp, bdiffs);
229 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
230 convert(temp, pdiffs);
232 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
233 convert(temp, tdiffs);
235 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
237 temp = validParameter.validFile(parameters, "qfile", true);
238 if (temp == "not found") { qFileName = ""; }
239 else if(temp == "not open") { abort = true; }
240 else { qFileName = temp; m->setQualFile(qFileName); }
242 temp = validParameter.validFile(parameters, "name", true);
243 if (temp == "not found") { nameFile = ""; }
244 else if(temp == "not open") { nameFile = ""; abort = true; }
245 else { nameFile = temp; m->setNameFile(nameFile); }
247 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
248 convert(temp, qThreshold);
250 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
251 qtrim = m->isTrue(temp);
253 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
254 convert(temp, qRollAverage);
256 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
257 convert(temp, qWindowAverage);
259 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
260 convert(temp, qWindowSize);
262 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
263 convert(temp, qWindowStep);
265 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
266 convert(temp, qAverage);
268 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
269 convert(temp, keepFirst);
271 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
272 convert(temp, removeLast);
274 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
275 allFiles = m->isTrue(temp);
277 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
278 m->setProcessors(temp);
279 convert(temp, processors);
282 if(allFiles && (oligoFile == "")){
283 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
285 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
286 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
290 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
291 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
297 catch(exception& e) {
298 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
302 //***************************************************************************************************************
304 int TrimSeqsCommand::execute(){
307 if (abort == true) { if (calledHelp) { return 0; } return 2; }
309 numFPrimers = 0; //this needs to be initialized
311 vector<vector<string> > fastaFileNames;
312 vector<vector<string> > qualFileNames;
313 vector<vector<string> > nameFileNames;
315 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
316 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
318 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
319 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
321 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
322 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
324 if (qFileName != "") {
325 outputNames.push_back(trimQualFile);
326 outputNames.push_back(scrapQualFile);
327 outputTypes["qfile"].push_back(trimQualFile);
328 outputTypes["qfile"].push_back(scrapQualFile);
331 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
332 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
334 if (nameFile != "") {
335 m->readNames(nameFile, nameMap);
336 outputNames.push_back(trimNameFile);
337 outputNames.push_back(scrapNameFile);
338 outputTypes["name"].push_back(trimNameFile);
339 outputTypes["name"].push_back(scrapNameFile);
342 if (m->control_pressed) { return 0; }
344 string outputGroupFileName;
346 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
347 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
348 getOligos(fastaFileNames, qualFileNames, nameFileNames);
351 vector<unsigned long int> fastaFilePos;
352 vector<unsigned long int> qFilePos;
354 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
356 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
357 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
358 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
360 if(qFileName == "") { qLines = lines; } //files with duds
362 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
364 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
366 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
369 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
372 if (m->control_pressed) { return 0; }
375 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
376 map<string, string>::iterator it;
377 set<string> namesToRemove;
378 for(int i=0;i<fastaFileNames.size();i++){
379 for(int j=0;j<fastaFileNames[0].size();j++){
380 if (fastaFileNames[i][j] != "") {
381 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
382 if(m->isBlank(fastaFileNames[i][j])){
383 remove(fastaFileNames[i][j].c_str());
384 namesToRemove.insert(fastaFileNames[i][j]);
387 remove(qualFileNames[i][j].c_str());
388 namesToRemove.insert(qualFileNames[i][j]);
392 remove(nameFileNames[i][j].c_str());
393 namesToRemove.insert(nameFileNames[i][j]);
396 it = uniqueFastaNames.find(fastaFileNames[i][j]);
397 if (it == uniqueFastaNames.end()) {
398 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
406 //remove names for outputFileNames, just cleans up the output
407 vector<string> outputNames2;
408 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
409 outputNames = outputNames2;
411 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
413 m->openInputFile(it->first, in);
416 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
417 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
418 m->openOutputFile(thisGroupName, out);
421 if (m->control_pressed) { break; }
423 Sequence currSeq(in); m->gobble(in);
424 out << currSeq.getName() << '\t' << it->second << endl;
431 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
433 //output group counts
434 m->mothurOutEndLine();
436 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
437 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
438 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
440 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
442 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
444 //set fasta file as new current fastafile
446 itTypes = outputTypes.find("fasta");
447 if (itTypes != outputTypes.end()) {
448 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
451 itTypes = outputTypes.find("name");
452 if (itTypes != outputTypes.end()) {
453 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
456 itTypes = outputTypes.find("qfile");
457 if (itTypes != outputTypes.end()) {
458 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
461 itTypes = outputTypes.find("group");
462 if (itTypes != outputTypes.end()) {
463 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
466 m->mothurOutEndLine();
467 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
468 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
469 m->mothurOutEndLine();
474 catch(exception& e) {
475 m->errorOut(e, "TrimSeqsCommand", "execute");
480 /**************************************************************************************/
482 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair* line, linePair* qline) {
486 ofstream trimFASTAFile;
487 m->openOutputFile(trimFileName, trimFASTAFile);
489 ofstream scrapFASTAFile;
490 m->openOutputFile(scrapFileName, scrapFASTAFile);
492 ofstream trimQualFile;
493 ofstream scrapQualFile;
495 m->openOutputFile(trimQFileName, trimQualFile);
496 m->openOutputFile(scrapQFileName, scrapQualFile);
499 ofstream trimNameFile;
500 ofstream scrapNameFile;
502 m->openOutputFile(trimNFileName, trimNameFile);
503 m->openOutputFile(scrapNFileName, scrapNameFile);
507 ofstream outGroupsFile;
508 if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); }
510 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
511 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
512 if (fastaFileNames[i][j] != "") {
514 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
516 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
520 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
528 m->openInputFile(filename, inFASTA);
529 inFASTA.seekg(line->start);
532 if(qFileName != "") {
533 m->openInputFile(qFileName, qFile);
534 qFile.seekg(qline->start);
542 if (m->control_pressed) {
543 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
544 if (oligoFile != "") { outGroupsFile.close(); }
549 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
555 string trashCode = "";
556 int currentSeqsDiffs = 0;
558 Sequence currSeq(inFASTA); m->gobble(inFASTA);
559 QualityScores currQual;
561 currQual = QualityScores(qFile); m->gobble(qFile);
563 //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
564 string origSeq = currSeq.getUnaligned();
567 int barcodeIndex = 0;
570 if(barcodes.size() != 0){
571 success = stripBarcode(currSeq, currQual, barcodeIndex);
572 if(success > bdiffs) { trashCode += 'b'; }
573 else{ currentSeqsDiffs += success; }
576 if(numFPrimers != 0){
577 success = stripForward(currSeq, currQual, primerIndex);
578 if(success > pdiffs) { trashCode += 'f'; }
579 else{ currentSeqsDiffs += success; }
582 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
584 if(numRPrimers != 0){
585 success = stripReverse(currSeq, currQual);
586 if(!success) { trashCode += 'r'; }
590 success = keepFirstTrim(currSeq, currQual);
594 success = removeLastTrim(currSeq, currQual);
595 if(!success) { trashCode += 'l'; }
600 int origLength = currSeq.getNumBases();
602 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
603 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
604 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
605 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
606 else { success = 1; }
608 //you don't want to trim, if it fails above then scrap it
609 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
611 if(!success) { trashCode += 'q'; }
614 if(minLength > 0 || maxLength > 0){
615 success = cullLength(currSeq);
616 if(!success) { trashCode += 'l'; }
619 success = cullHomoP(currSeq);
620 if(!success) { trashCode += 'h'; }
623 success = cullAmbigs(currSeq);
624 if(!success) { trashCode += 'n'; }
627 if(flip){ // should go last
628 currSeq.reverseComplement();
630 currQual.flipQScores();
634 if(trashCode.length() == 0){
635 currSeq.setAligned(currSeq.getUnaligned());
636 currSeq.printSequence(trimFASTAFile);
639 currQual.printQScores(trimQualFile);
643 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
644 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
645 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
648 if(barcodes.size() != 0){
649 string thisGroup = barcodeNameVector[barcodeIndex];
650 if (primers.size() != 0) { if (primerNameVector[primerIndex] != "") { thisGroup += "." + primerNameVector[primerIndex]; } }
652 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
654 if (nameFile != "") {
655 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
656 if (itName != nameMap.end()) {
657 vector<string> thisSeqsNames;
658 m->splitAtChar(itName->second, thisSeqsNames, ',');
659 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
660 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
662 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
665 map<string, int>::iterator it = groupCounts.find(thisGroup);
666 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
667 else { groupCounts[it->first]++; }
674 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
675 currSeq.printSequence(output);
679 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
680 currQual.printQScores(output);
685 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
686 if (itName != nameMap.end()) {
687 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
688 output << itName->first << '\t' << itName->second << endl;
690 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
695 if(nameFile != ""){ //needs to be before the currSeq name is changed
696 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
697 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
698 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
700 currSeq.setName(currSeq.getName() + '|' + trashCode);
701 currSeq.setUnaligned(origSeq);
702 currSeq.setAligned(origSeq);
703 currSeq.printSequence(scrapFASTAFile);
705 currQual.printQScores(scrapQualFile);
711 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
712 unsigned long int pos = inFASTA.tellg();
713 if ((pos == -1) || (pos >= line->end)) { break; }
716 if (inFASTA.eof()) { break; }
720 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
724 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
728 trimFASTAFile.close();
729 scrapFASTAFile.close();
730 if (oligoFile != "") { outGroupsFile.close(); }
731 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
732 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
736 catch(exception& e) {
737 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
742 /**************************************************************************************************/
744 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
746 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
751 //loop through and create all the processes you want
752 while (process != processors) {
756 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
760 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
761 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
762 vector<vector<string> > tempNameFileNames = nameFileNames;
767 for(int i=0;i<tempFASTAFileNames.size();i++){
768 for(int j=0;j<tempFASTAFileNames[i].size();j++){
769 if (tempFASTAFileNames[i][j] != "") {
770 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
771 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
774 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
775 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
778 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
779 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
786 driverCreateTrim(filename,
788 (trimFASTAFileName + toString(getpid()) + ".temp"),
789 (scrapFASTAFileName + toString(getpid()) + ".temp"),
790 (trimQualFileName + toString(getpid()) + ".temp"),
791 (scrapQualFileName + toString(getpid()) + ".temp"),
792 (trimNameFileName + toString(getpid()) + ".temp"),
793 (scrapNameFileName + toString(getpid()) + ".temp"),
794 (groupFile + toString(getpid()) + ".temp"),
796 tempPrimerQualFileNames,
801 //pass groupCounts to parent
804 string tempFile = filename + toString(getpid()) + ".num.temp";
805 m->openOutputFile(tempFile, out);
807 out << groupCounts.size() << endl;
809 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
810 out << it->first << '\t' << it->second << endl;
816 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
817 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
824 m->openOutputFile(trimFASTAFileName, temp); temp.close();
825 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
827 m->openOutputFile(trimQualFileName, temp); temp.close();
828 m->openOutputFile(scrapQualFileName, temp); temp.close();
830 if (nameFile != "") {
831 m->openOutputFile(trimNameFileName, temp); temp.close();
832 m->openOutputFile(scrapNameFileName, temp); temp.close();
835 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
837 //force parent to wait until all the processes are done
838 for (int i=0;i<processIDS.size();i++) {
839 int temp = processIDS[i];
844 for(int i=0;i<processIDS.size();i++){
846 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
848 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
849 remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
850 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
851 remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
854 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
855 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
856 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
857 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
861 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
862 remove((trimNameFileName + toString(processIDS[i]) + ".temp").c_str());
863 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
864 remove((scrapNameFileName + toString(processIDS[i]) + ".temp").c_str());
868 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
869 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
874 for(int j=0;j<fastaFileNames.size();j++){
875 for(int k=0;k<fastaFileNames[j].size();k++){
876 if (fastaFileNames[j][k] != "") {
877 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
878 remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
881 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
882 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
886 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
887 remove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
896 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
897 m->openInputFile(tempFile, in);
901 in >> tempNum; m->gobble(in);
905 in >> group >> tempNum; m->gobble(in);
907 map<string, int>::iterator it = groupCounts.find(group);
908 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
909 else { groupCounts[it->first] += tempNum; }
912 in.close(); remove(tempFile.c_str());
920 catch(exception& e) {
921 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
926 /**************************************************************************************************/
928 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
930 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
931 //set file positions for fasta file
932 fastaFilePos = m->divideFile(filename, processors);
934 if (qfilename == "") { return processors; }
936 //get name of first sequence in each chunk
937 map<string, int> firstSeqNames;
938 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
940 m->openInputFile(filename, in);
941 in.seekg(fastaFilePos[i]);
944 firstSeqNames[temp.getName()] = i;
949 //seach for filePos of each first name in the qfile and save in qfileFilePos
951 m->openInputFile(qfilename, inQual);
954 while(!inQual.eof()){
955 input = m->getline(inQual);
957 if (input.length() != 0) {
958 if(input[0] == '>'){ //this is a sequence name line
959 istringstream nameStream(input);
961 string sname = ""; nameStream >> sname;
962 sname = sname.substr(1);
964 map<string, int>::iterator it = firstSeqNames.find(sname);
966 if(it != firstSeqNames.end()) { //this is the start of a new chunk
967 unsigned long int pos = inQual.tellg();
968 qfileFilePos.push_back(pos - input.length() - 1);
969 firstSeqNames.erase(it);
974 if (firstSeqNames.size() == 0) { break; }
979 if (firstSeqNames.size() != 0) {
980 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
981 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
987 //get last file position of qfile
989 unsigned long int size;
991 //get num bytes in file
992 pFile = fopen (qfilename.c_str(),"rb");
993 if (pFile==NULL) perror ("Error opening file");
995 fseek (pFile, 0, SEEK_END);
1000 qfileFilePos.push_back(size);
1006 fastaFilePos.push_back(0); qfileFilePos.push_back(0);
1007 //get last file position of fastafile
1009 unsigned long int size;
1011 //get num bytes in file
1012 pFile = fopen (filename.c_str(),"rb");
1013 if (pFile==NULL) perror ("Error opening file");
1015 fseek (pFile, 0, SEEK_END);
1019 fastaFilePos.push_back(size);
1021 //get last file position of fastafile
1024 //get num bytes in file
1025 qFile = fopen (qfilename.c_str(),"rb");
1026 if (qFile==NULL) perror ("Error opening file");
1028 fseek (qFile, 0, SEEK_END);
1032 qfileFilePos.push_back(size);
1038 catch(exception& e) {
1039 m->errorOut(e, "TrimSeqsCommand", "setLines");
1044 //***************************************************************************************************************
1046 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1049 m->openInputFile(oligoFile, inOligos);
1053 string type, oligo, group;
1055 int indexPrimer = 0;
1056 int indexBarcode = 0;
1058 while(!inOligos.eof()){
1063 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1064 m->gobble(inOligos);
1067 m->gobble(inOligos);
1068 //make type case insensitive
1069 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1073 for(int i=0;i<oligo.length();i++){
1074 oligo[i] = toupper(oligo[i]);
1075 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1078 if(type == "FORWARD"){
1081 // get rest of line in case there is a primer name
1082 while (!inOligos.eof()) {
1083 char c = inOligos.get();
1084 if (c == 10 || c == 13){ break; }
1085 else if (c == 32 || c == 9){;} //space or tab
1086 else { group += c; }
1089 //check for repeat barcodes
1090 map<string, int>::iterator itPrime = primers.find(oligo);
1091 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1093 primers[oligo]=indexPrimer; indexPrimer++;
1094 primerNameVector.push_back(group);
1096 else if(type == "REVERSE"){
1097 Sequence oligoRC("reverse", oligo);
1098 oligoRC.reverseComplement();
1099 revPrimer.push_back(oligoRC.getUnaligned());
1101 else if(type == "BARCODE"){
1104 //check for repeat barcodes
1105 map<string, int>::iterator itBar = barcodes.find(oligo);
1106 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1108 barcodes[oligo]=indexBarcode; indexBarcode++;
1109 barcodeNameVector.push_back(group);
1111 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1113 m->gobble(inOligos);
1117 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1119 //add in potential combos
1120 if(barcodeNameVector.size() == 0){
1122 barcodeNameVector.push_back("");
1125 if(primerNameVector.size() == 0){
1127 primerNameVector.push_back("");
1130 fastaFileNames.resize(barcodeNameVector.size());
1131 for(int i=0;i<fastaFileNames.size();i++){
1132 fastaFileNames[i].assign(primerNameVector.size(), "");
1134 if(qFileName != "") { qualFileNames = fastaFileNames; }
1135 if(nameFile != "") { nameFileNames = fastaFileNames; }
1138 set<string> uniqueNames; //used to cleanup outputFileNames
1139 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1140 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1142 string primerName = primerNameVector[itPrimer->second];
1143 string barcodeName = barcodeNameVector[itBar->second];
1145 string comboGroupName = "";
1146 string fastaFileName = "";
1147 string qualFileName = "";
1148 string nameFileName = "";
1150 if(primerName == ""){
1151 comboGroupName = barcodeNameVector[itBar->second];
1154 if(barcodeName == ""){
1155 comboGroupName = primerNameVector[itPrimer->second];
1158 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1164 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1165 if (uniqueNames.count(fastaFileName) == 0) {
1166 outputNames.push_back(fastaFileName);
1167 outputTypes["fasta"].push_back(fastaFileName);
1168 uniqueNames.insert(fastaFileName);
1171 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1172 m->openOutputFile(fastaFileName, temp); temp.close();
1174 if(qFileName != ""){
1175 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1176 if (uniqueNames.count(qualFileName) == 0) {
1177 outputNames.push_back(qualFileName);
1178 outputTypes["qfile"].push_back(qualFileName);
1181 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1182 m->openOutputFile(qualFileName, temp); temp.close();
1186 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1187 if (uniqueNames.count(nameFileName) == 0) {
1188 outputNames.push_back(nameFileName);
1189 outputTypes["name"].push_back(nameFileName);
1192 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1193 m->openOutputFile(nameFileName, temp); temp.close();
1199 numFPrimers = primers.size();
1200 numRPrimers = revPrimer.size();
1203 catch(exception& e) {
1204 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1209 //***************************************************************************************************************
1211 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1214 string rawSequence = seq.getUnaligned();
1215 int success = bdiffs + 1; //guilty until proven innocent
1217 //can you find the barcode
1218 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1219 string oligo = it->first;
1220 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
1221 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
1225 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1227 seq.setUnaligned(rawSequence.substr(oligo.length()));
1229 if(qual.getName() != ""){
1230 qual.trimQScores(oligo.length(), -1);
1238 //if you found the barcode or if you don't want to allow for diffs
1239 if ((bdiffs == 0) || (success == 0)) { return success; }
1241 else { //try aligning and see if you can find it
1245 Alignment* alignment;
1246 if (barcodes.size() > 0) {
1247 map<string,int>::iterator it=barcodes.begin();
1249 for(it;it!=barcodes.end();it++){
1250 if(it->first.length() > maxLength){
1251 maxLength = it->first.length();
1254 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
1256 }else{ alignment = NULL; }
1258 //can you find the barcode
1264 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1265 string oligo = it->first;
1266 // int length = oligo.length();
1268 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
1269 success = bdiffs + 10;
1273 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1274 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1275 oligo = alignment->getSeqAAln();
1276 string temp = alignment->getSeqBAln();
1278 int alnLength = oligo.length();
1280 for(int i=oligo.length()-1;i>=0;i--){
1281 if(oligo[i] != '-'){ alnLength = i+1; break; }
1283 oligo = oligo.substr(0,alnLength);
1284 temp = temp.substr(0,alnLength);
1286 int numDiff = countDiffs(oligo, temp);
1288 if(numDiff < minDiff){
1291 minGroup = it->second;
1293 for(int i=0;i<alnLength;i++){
1299 else if(numDiff == minDiff){
1305 if(minDiff > bdiffs) { success = minDiff; } //no good matches
1306 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
1307 else{ //use the best match
1309 seq.setUnaligned(rawSequence.substr(minPos));
1311 if(qual.getName() != ""){
1312 qual.trimQScores(minPos, -1);
1317 if (alignment != NULL) { delete alignment; }
1324 catch(exception& e) {
1325 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1331 //***************************************************************************************************************
1333 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1335 string rawSequence = seq.getUnaligned();
1336 int success = pdiffs + 1; //guilty until proven innocent
1338 //can you find the primer
1339 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1340 string oligo = it->first;
1341 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
1342 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1346 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1348 seq.setUnaligned(rawSequence.substr(oligo.length()));
1349 if(qual.getName() != ""){
1350 qual.trimQScores(oligo.length(), -1);
1357 //if you found the barcode or if you don't want to allow for diffs
1358 if ((pdiffs == 0) || (success == 0)) { return success; }
1360 else { //try aligning and see if you can find it
1364 Alignment* alignment;
1365 if (primers.size() > 0) {
1366 map<string,int>::iterator it=primers.begin();
1368 for(it;it!=primers.end();it++){
1369 if(it->first.length() > maxLength){
1370 maxLength = it->first.length();
1373 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
1375 }else{ alignment = NULL; }
1377 //can you find the barcode
1383 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1384 string oligo = it->first;
1385 // int length = oligo.length();
1387 if(rawSequence.length() < maxLength){
1388 success = pdiffs + 100;
1392 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1393 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1394 oligo = alignment->getSeqAAln();
1395 string temp = alignment->getSeqBAln();
1397 int alnLength = oligo.length();
1399 for(int i=oligo.length()-1;i>=0;i--){
1400 if(oligo[i] != '-'){ alnLength = i+1; break; }
1402 oligo = oligo.substr(0,alnLength);
1403 temp = temp.substr(0,alnLength);
1405 int numDiff = countDiffs(oligo, temp);
1407 if(numDiff < minDiff){
1410 minGroup = it->second;
1412 for(int i=0;i<alnLength;i++){
1418 else if(numDiff == minDiff){
1424 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1425 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
1426 else{ //use the best match
1428 seq.setUnaligned(rawSequence.substr(minPos));
1429 if(qual.getName() != ""){
1430 qual.trimQScores(minPos, -1);
1435 if (alignment != NULL) { delete alignment; }
1442 catch(exception& e) {
1443 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1448 //***************************************************************************************************************
1450 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1452 string rawSequence = seq.getUnaligned();
1453 bool success = 0; //guilty until proven innocent
1455 for(int i=0;i<numRPrimers;i++){
1456 string oligo = revPrimer[i];
1458 if(rawSequence.length() < oligo.length()){
1463 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1464 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1465 if(qual.getName() != ""){
1466 qual.trimQScores(-1, rawSequence.length()-oligo.length());
1475 catch(exception& e) {
1476 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1481 //***************************************************************************************************************
1483 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1486 if(qscores.getName() != ""){
1487 qscores.trimQScores(-1, keepFirst);
1489 sequence.trim(keepFirst);
1492 catch(exception& e) {
1493 m->errorOut(e, "keepFirstTrim", "countDiffs");
1499 //***************************************************************************************************************
1501 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1505 int length = sequence.getNumBases() - removeLast;
1508 if(qscores.getName() != ""){
1509 qscores.trimQScores(-1, length);
1511 sequence.trim(length);
1520 catch(exception& e) {
1521 m->errorOut(e, "removeLastTrim", "countDiffs");
1527 //***************************************************************************************************************
1529 bool TrimSeqsCommand::cullLength(Sequence& seq){
1532 int length = seq.getNumBases();
1533 bool success = 0; //guilty until proven innocent
1535 if(length >= minLength && maxLength == 0) { success = 1; }
1536 else if(length >= minLength && length <= maxLength) { success = 1; }
1537 else { success = 0; }
1542 catch(exception& e) {
1543 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1549 //***************************************************************************************************************
1551 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1553 int longHomoP = seq.getLongHomoPolymer();
1554 bool success = 0; //guilty until proven innocent
1556 if(longHomoP <= maxHomoP){ success = 1; }
1557 else { success = 0; }
1561 catch(exception& e) {
1562 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1568 //***************************************************************************************************************
1570 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1572 int numNs = seq.getAmbigBases();
1573 bool success = 0; //guilty until proven innocent
1575 if(numNs <= maxAmbig) { success = 1; }
1576 else { success = 0; }
1580 catch(exception& e) {
1581 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1587 //***************************************************************************************************************
1589 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1592 int length = oligo.length();
1594 for(int i=0;i<length;i++){
1596 if(oligo[i] != seq[i]){
1597 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
1598 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
1599 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
1600 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
1601 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
1602 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1603 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
1604 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1605 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1606 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1607 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
1608 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1610 if(success == 0) { break; }
1619 catch(exception& e) {
1620 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1626 //***************************************************************************************************************
1628 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1631 int length = oligo.length();
1634 for(int i=0;i<length;i++){
1636 if(oligo[i] != seq[i]){
1637 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
1638 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
1639 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
1640 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
1641 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
1642 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1643 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
1644 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1645 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1646 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1647 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
1648 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1655 catch(exception& e) {
1656 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1662 //***************************************************************************************************************