5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
13 //**********************************************************************************************************************
14 vector<string> TrimSeqsCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
19 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
20 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
21 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
22 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
23 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
24 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
25 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
26 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
27 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
28 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
29 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
30 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
31 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
32 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
33 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
34 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
35 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
36 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
37 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
38 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
39 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
40 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
42 vector<string> myArray;
43 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
47 m->errorOut(e, "TrimSeqsCommand", "setParameters");
51 //**********************************************************************************************************************
52 string TrimSeqsCommand::getHelpString(){
54 string helpString = "";
55 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
56 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
57 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
58 helpString += "The fasta parameter is required.\n";
59 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
60 helpString += "The oligos parameter allows you to provide an oligos file.\n";
61 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
62 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
63 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
64 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
65 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
66 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
67 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
68 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
69 helpString += "The qfile parameter allows you to provide a quality file.\n";
70 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
71 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
72 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
73 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
74 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
75 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
76 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
77 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
78 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
79 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
80 helpString += "The trim.seqs command should be in the following format: \n";
81 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
82 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
83 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
84 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
85 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
89 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
95 //**********************************************************************************************************************
97 TrimSeqsCommand::TrimSeqsCommand(){
99 abort = true; calledHelp = true;
101 vector<string> tempOutNames;
102 outputTypes["fasta"] = tempOutNames;
103 outputTypes["qfile"] = tempOutNames;
104 outputTypes["group"] = tempOutNames;
105 outputTypes["name"] = tempOutNames;
107 catch(exception& e) {
108 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
112 //***************************************************************************************************************
114 TrimSeqsCommand::TrimSeqsCommand(string option) {
117 abort = false; calledHelp = false;
120 //allow user to run help
121 if(option == "help") { help(); abort = true; calledHelp = true; }
122 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
125 vector<string> myArray = setParameters();
127 OptionParser parser(option);
128 map<string,string> parameters = parser.getParameters();
130 ValidParameters validParameter;
131 map<string,string>::iterator it;
133 //check to make sure all parameters are valid for command
134 for (it = parameters.begin(); it != parameters.end(); it++) {
135 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
138 //initialize outputTypes
139 vector<string> tempOutNames;
140 outputTypes["fasta"] = tempOutNames;
141 outputTypes["qfile"] = tempOutNames;
142 outputTypes["group"] = tempOutNames;
143 outputTypes["name"] = tempOutNames;
145 //if the user changes the input directory command factory will send this info to us in the output parameter
146 string inputDir = validParameter.validFile(parameters, "inputdir", false);
147 if (inputDir == "not found"){ inputDir = ""; }
150 it = parameters.find("fasta");
151 //user has given a template file
152 if(it != parameters.end()){
153 path = m->hasPath(it->second);
154 //if the user has not given a path then, add inputdir. else leave path alone.
155 if (path == "") { parameters["fasta"] = inputDir + it->second; }
158 it = parameters.find("oligos");
159 //user has given a template file
160 if(it != parameters.end()){
161 path = m->hasPath(it->second);
162 //if the user has not given a path then, add inputdir. else leave path alone.
163 if (path == "") { parameters["oligos"] = inputDir + it->second; }
166 it = parameters.find("qfile");
167 //user has given a template file
168 if(it != parameters.end()){
169 path = m->hasPath(it->second);
170 //if the user has not given a path then, add inputdir. else leave path alone.
171 if (path == "") { parameters["qfile"] = inputDir + it->second; }
174 it = parameters.find("name");
175 //user has given a template file
176 if(it != parameters.end()){
177 path = m->hasPath(it->second);
178 //if the user has not given a path then, add inputdir. else leave path alone.
179 if (path == "") { parameters["name"] = inputDir + it->second; }
185 //check for required parameters
186 fastaFile = validParameter.validFile(parameters, "fasta", true);
187 if (fastaFile == "not found") {
188 fastaFile = m->getFastaFile();
189 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
190 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
191 }else if (fastaFile == "not open") { abort = true; }
193 //if the user changes the output directory command factory will send this info to us in the output parameter
194 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
196 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
200 //check for optional parameter and set defaults
201 // ...at some point should added some additional type checking...
203 temp = validParameter.validFile(parameters, "flip", false);
204 if (temp == "not found"){ flip = 0; }
205 else if(m->isTrue(temp)) { flip = 1; }
207 temp = validParameter.validFile(parameters, "oligos", true);
208 if (temp == "not found"){ oligoFile = ""; }
209 else if(temp == "not open"){ abort = true; }
210 else { oligoFile = temp; }
213 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
214 convert(temp, maxAmbig);
216 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
217 convert(temp, maxHomoP);
219 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
220 convert(temp, minLength);
222 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
223 convert(temp, maxLength);
225 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
226 convert(temp, bdiffs);
228 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
229 convert(temp, pdiffs);
231 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
232 convert(temp, tdiffs);
234 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
236 temp = validParameter.validFile(parameters, "qfile", true);
237 if (temp == "not found") { qFileName = ""; }
238 else if(temp == "not open") { abort = true; }
239 else { qFileName = temp; }
241 temp = validParameter.validFile(parameters, "name", true);
242 if (temp == "not found") { nameFile = ""; }
243 else if(temp == "not open") { abort = true; }
244 else { nameFile = temp; }
246 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
247 convert(temp, qThreshold);
249 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
250 qtrim = m->isTrue(temp);
252 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
253 convert(temp, qRollAverage);
255 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
256 convert(temp, qWindowAverage);
258 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
259 convert(temp, qWindowSize);
261 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
262 convert(temp, qWindowStep);
264 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
265 convert(temp, qAverage);
267 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
268 convert(temp, keepFirst);
270 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
271 convert(temp, removeLast);
273 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
274 allFiles = m->isTrue(temp);
276 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
277 m->setProcessors(temp);
278 convert(temp, processors);
281 if(allFiles && (oligoFile == "")){
282 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
284 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
285 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
289 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
290 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
296 catch(exception& e) {
297 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
301 //***************************************************************************************************************
303 int TrimSeqsCommand::execute(){
306 if (abort == true) { if (calledHelp) { return 0; } return 2; }
308 numFPrimers = 0; //this needs to be initialized
310 vector<vector<string> > fastaFileNames;
311 vector<vector<string> > qualFileNames;
312 vector<vector<string> > nameFileNames;
314 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
315 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
317 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
318 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
320 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
321 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
323 if (qFileName != "") {
324 outputNames.push_back(trimQualFile);
325 outputNames.push_back(scrapQualFile);
326 outputTypes["qfile"].push_back(trimQualFile);
327 outputTypes["qfile"].push_back(scrapQualFile);
330 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
331 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
333 if (nameFile != "") {
334 m->readNames(nameFile, nameMap);
335 outputNames.push_back(trimNameFile);
336 outputNames.push_back(scrapNameFile);
337 outputTypes["name"].push_back(trimNameFile);
338 outputTypes["name"].push_back(scrapNameFile);
341 if (m->control_pressed) { return 0; }
343 string outputGroupFileName;
345 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
346 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
347 getOligos(fastaFileNames, qualFileNames, nameFileNames);
350 vector<unsigned long int> fastaFilePos;
351 vector<unsigned long int> qFilePos;
353 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
355 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
356 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
357 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
359 if(qFileName == "") { qLines = lines; } //files with duds
361 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
363 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
365 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
368 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
371 if (m->control_pressed) { return 0; }
374 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
375 map<string, string>::iterator it;
376 set<string> namesToRemove;
377 for(int i=0;i<fastaFileNames.size();i++){
378 for(int j=0;j<fastaFileNames[0].size();j++){
379 if (fastaFileNames[i][j] != "") {
380 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
381 if(m->isBlank(fastaFileNames[i][j])){
382 remove(fastaFileNames[i][j].c_str());
383 namesToRemove.insert(fastaFileNames[i][j]);
386 remove(qualFileNames[i][j].c_str());
387 namesToRemove.insert(qualFileNames[i][j]);
391 remove(nameFileNames[i][j].c_str());
392 namesToRemove.insert(nameFileNames[i][j]);
395 it = uniqueFastaNames.find(fastaFileNames[i][j]);
396 if (it == uniqueFastaNames.end()) {
397 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
405 //remove names for outputFileNames, just cleans up the output
406 vector<string> outputNames2;
407 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
408 outputNames = outputNames2;
410 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
412 m->openInputFile(it->first, in);
415 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
416 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
417 m->openOutputFile(thisGroupName, out);
420 if (m->control_pressed) { break; }
422 Sequence currSeq(in); m->gobble(in);
423 out << currSeq.getName() << '\t' << it->second << endl;
430 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
432 //output group counts
433 m->mothurOutEndLine();
435 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
436 total += it->second; m->mothurOut("Group " + it->first + " contains " + toString(it->second) + " sequences."); m->mothurOutEndLine();
438 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
440 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
442 //set fasta file as new current fastafile
444 itTypes = outputTypes.find("fasta");
445 if (itTypes != outputTypes.end()) {
446 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
449 itTypes = outputTypes.find("name");
450 if (itTypes != outputTypes.end()) {
451 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
454 itTypes = outputTypes.find("qfile");
455 if (itTypes != outputTypes.end()) {
456 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
459 itTypes = outputTypes.find("group");
460 if (itTypes != outputTypes.end()) {
461 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
464 m->mothurOutEndLine();
465 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
466 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
467 m->mothurOutEndLine();
472 catch(exception& e) {
473 m->errorOut(e, "TrimSeqsCommand", "execute");
478 /**************************************************************************************/
480 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair* line, linePair* qline) {
484 ofstream trimFASTAFile;
485 m->openOutputFile(trimFileName, trimFASTAFile);
487 ofstream scrapFASTAFile;
488 m->openOutputFile(scrapFileName, scrapFASTAFile);
490 ofstream trimQualFile;
491 ofstream scrapQualFile;
493 m->openOutputFile(trimQFileName, trimQualFile);
494 m->openOutputFile(scrapQFileName, scrapQualFile);
497 ofstream trimNameFile;
498 ofstream scrapNameFile;
500 m->openOutputFile(trimNFileName, trimNameFile);
501 m->openOutputFile(scrapNFileName, scrapNameFile);
505 ofstream outGroupsFile;
506 if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); }
508 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
509 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
510 if (fastaFileNames[i][j] != "") {
512 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
514 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
518 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
526 m->openInputFile(filename, inFASTA);
527 inFASTA.seekg(line->start);
530 if(qFileName != "") {
531 m->openInputFile(qFileName, qFile);
532 qFile.seekg(qline->start);
540 if (m->control_pressed) {
541 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
542 if (oligoFile != "") { outGroupsFile.close(); }
547 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
553 string trashCode = "";
554 int currentSeqsDiffs = 0;
556 Sequence currSeq(inFASTA); m->gobble(inFASTA);
558 QualityScores currQual;
560 currQual = QualityScores(qFile); m->gobble(qFile);
563 string origSeq = currSeq.getUnaligned();
566 int barcodeIndex = 0;
569 if(barcodes.size() != 0){
570 success = stripBarcode(currSeq, currQual, barcodeIndex);
571 if(success > bdiffs) { trashCode += 'b'; }
572 else{ currentSeqsDiffs += success; }
575 if(numFPrimers != 0){
576 success = stripForward(currSeq, currQual, primerIndex);
577 if(success > pdiffs) { trashCode += 'f'; }
578 else{ currentSeqsDiffs += success; }
581 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
583 if(numRPrimers != 0){
584 success = stripReverse(currSeq, currQual);
585 if(!success) { trashCode += 'r'; }
589 success = keepFirstTrim(currSeq, currQual);
593 success = removeLastTrim(currSeq, currQual);
594 if(!success) { trashCode += 'l'; }
599 int origLength = currSeq.getNumBases();
601 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
602 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
603 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
604 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
605 else { success = 1; }
607 //you don't want to trim, if it fails above then scrap it
608 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
610 if(!success) { trashCode += 'q'; }
613 if(minLength > 0 || maxLength > 0){
614 success = cullLength(currSeq);
615 if(!success) { trashCode += 'l'; }
618 success = cullHomoP(currSeq);
619 if(!success) { trashCode += 'h'; }
622 success = cullAmbigs(currSeq);
623 if(!success) { trashCode += 'n'; }
626 if(flip){ // should go last
627 currSeq.reverseComplement();
629 currQual.flipQScores();
633 if(trashCode.length() == 0){
634 currSeq.setAligned(currSeq.getUnaligned());
635 currSeq.printSequence(trimFASTAFile);
638 currQual.printQScores(trimQualFile);
642 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
643 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
644 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
647 if(barcodes.size() != 0){
648 string thisGroup = barcodeNameVector[barcodeIndex];
649 if (primers.size() != 0) { if (primerNameVector[primerIndex] != "") { thisGroup += "." + primerNameVector[primerIndex]; } }
651 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
653 map<string, int>::iterator it = groupCounts.find(thisGroup);
654 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
655 else { groupCounts[it->first]++; }
662 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
663 currSeq.printSequence(output);
667 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
668 currQual.printQScores(output);
673 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
674 if (itName != nameMap.end()) {
675 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
676 output << itName->first << '\t' << itName->second << endl;
678 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
683 if(nameFile != ""){ //needs to be before the currSeq name is changed
684 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
685 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
686 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
688 currSeq.setName(currSeq.getName() + '|' + trashCode);
689 currSeq.setUnaligned(origSeq);
690 currSeq.setAligned(origSeq);
691 currSeq.printSequence(scrapFASTAFile);
693 currQual.printQScores(scrapQualFile);
699 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
700 unsigned long int pos = inFASTA.tellg();
701 if ((pos == -1) || (pos >= line->end)) { break; }
703 if (inFASTA.eof()) { break; }
707 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
711 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
715 trimFASTAFile.close();
716 scrapFASTAFile.close();
717 if (oligoFile != "") { outGroupsFile.close(); }
718 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
719 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
723 catch(exception& e) {
724 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
729 /**************************************************************************************************/
731 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
733 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
738 //loop through and create all the processes you want
739 while (process != processors) {
743 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
747 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
748 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
749 vector<vector<string> > tempNameFileNames = nameFileNames;
754 for(int i=0;i<tempFASTAFileNames.size();i++){
755 for(int j=0;j<tempFASTAFileNames[i].size();j++){
756 if (tempFASTAFileNames[i][j] != "") {
757 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
758 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
761 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
762 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
765 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
766 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
773 driverCreateTrim(filename,
775 (trimFASTAFileName + toString(getpid()) + ".temp"),
776 (scrapFASTAFileName + toString(getpid()) + ".temp"),
777 (trimQualFileName + toString(getpid()) + ".temp"),
778 (scrapQualFileName + toString(getpid()) + ".temp"),
779 (trimNameFileName + toString(getpid()) + ".temp"),
780 (scrapNameFileName + toString(getpid()) + ".temp"),
781 (groupFile + toString(getpid()) + ".temp"),
783 tempPrimerQualFileNames,
788 //pass groupCounts to parent
790 string tempFile = filename + toString(getpid()) + ".num.temp";
791 m->openOutputFile(tempFile, out);
792 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
793 out << it->first << '\t' << it->second << endl;
799 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
800 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
807 m->openOutputFile(trimFASTAFileName, temp); temp.close();
808 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
809 m->openOutputFile(trimQualFileName, temp); temp.close();
810 m->openOutputFile(scrapQualFileName, temp); temp.close();
811 m->openOutputFile(trimNameFileName, temp); temp.close();
812 m->openOutputFile(scrapNameFileName, temp); temp.close();
814 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
816 //force parent to wait until all the processes are done
817 for (int i=0;i<processIDS.size();i++) {
818 int temp = processIDS[i];
823 for(int i=0;i<processIDS.size();i++){
825 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
827 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
828 remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
829 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
830 remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
833 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
834 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
835 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
836 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
840 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
841 remove((trimNameFileName + toString(processIDS[i]) + ".temp").c_str());
842 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
843 remove((scrapNameFileName + toString(processIDS[i]) + ".temp").c_str());
846 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
847 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
851 for(int j=0;j<fastaFileNames.size();j++){
852 for(int k=0;k<fastaFileNames[j].size();k++){
853 if (fastaFileNames[j][k] != "") {
854 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
855 remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
858 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
859 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
863 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
864 remove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
872 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
873 m->openInputFile(tempFile, in);
877 in >> group >> tempNum; m->gobble(in);
879 map<string, int>::iterator it = groupCounts.find(group);
880 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
881 else { groupCounts[it->first] += tempNum; }
883 in.close(); remove(tempFile.c_str());
890 catch(exception& e) {
891 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
896 /**************************************************************************************************/
898 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
900 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
901 //set file positions for fasta file
902 fastaFilePos = m->divideFile(filename, processors);
904 if (qfilename == "") { return processors; }
906 //get name of first sequence in each chunk
907 map<string, int> firstSeqNames;
908 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
910 m->openInputFile(filename, in);
911 in.seekg(fastaFilePos[i]);
914 firstSeqNames[temp.getName()] = i;
919 //seach for filePos of each first name in the qfile and save in qfileFilePos
921 m->openInputFile(qfilename, inQual);
924 while(!inQual.eof()){
925 input = m->getline(inQual);
927 if (input.length() != 0) {
928 if(input[0] == '>'){ //this is a sequence name line
929 istringstream nameStream(input);
931 string sname = ""; nameStream >> sname;
932 sname = sname.substr(1);
934 map<string, int>::iterator it = firstSeqNames.find(sname);
936 if(it != firstSeqNames.end()) { //this is the start of a new chunk
937 unsigned long int pos = inQual.tellg();
938 qfileFilePos.push_back(pos - input.length() - 1);
939 firstSeqNames.erase(it);
944 if (firstSeqNames.size() == 0) { break; }
949 if (firstSeqNames.size() != 0) {
950 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
951 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
957 //get last file position of qfile
959 unsigned long int size;
961 //get num bytes in file
962 pFile = fopen (qfilename.c_str(),"rb");
963 if (pFile==NULL) perror ("Error opening file");
965 fseek (pFile, 0, SEEK_END);
970 qfileFilePos.push_back(size);
976 fastaFilePos.push_back(0); qfileFilePos.push_back(0);
977 //get last file position of fastafile
979 unsigned long int size;
981 //get num bytes in file
982 pFile = fopen (filename.c_str(),"rb");
983 if (pFile==NULL) perror ("Error opening file");
985 fseek (pFile, 0, SEEK_END);
989 fastaFilePos.push_back(size);
991 //get last file position of fastafile
994 //get num bytes in file
995 qFile = fopen (qfilename.c_str(),"rb");
996 if (qFile==NULL) perror ("Error opening file");
998 fseek (qFile, 0, SEEK_END);
1002 qfileFilePos.push_back(size);
1008 catch(exception& e) {
1009 m->errorOut(e, "TrimSeqsCommand", "setLines");
1014 //***************************************************************************************************************
1016 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1019 m->openInputFile(oligoFile, inOligos);
1023 string type, oligo, group;
1025 int indexPrimer = 0;
1026 int indexBarcode = 0;
1028 while(!inOligos.eof()){
1033 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1034 m->gobble(inOligos);
1037 m->gobble(inOligos);
1038 //make type case insensitive
1039 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1043 for(int i=0;i<oligo.length();i++){
1044 oligo[i] = toupper(oligo[i]);
1045 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1048 if(type == "FORWARD"){
1051 // get rest of line in case there is a primer name
1052 while (!inOligos.eof()) {
1053 char c = inOligos.get();
1054 if (c == 10 || c == 13){ break; }
1055 else if (c == 32 || c == 9){;} //space or tab
1056 else { group += c; }
1059 //check for repeat barcodes
1060 map<string, int>::iterator itPrime = primers.find(oligo);
1061 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1063 primers[oligo]=indexPrimer; indexPrimer++;
1064 primerNameVector.push_back(group);
1066 else if(type == "REVERSE"){
1067 Sequence oligoRC("reverse", oligo);
1068 oligoRC.reverseComplement();
1069 revPrimer.push_back(oligoRC.getUnaligned());
1071 else if(type == "BARCODE"){
1074 //check for repeat barcodes
1075 map<string, int>::iterator itBar = barcodes.find(oligo);
1076 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1078 barcodes[oligo]=indexBarcode; indexBarcode++;
1079 barcodeNameVector.push_back(group);
1081 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1083 m->gobble(inOligos);
1087 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1089 //add in potential combos
1090 if(barcodeNameVector.size() == 0){
1092 barcodeNameVector.push_back("");
1095 if(primerNameVector.size() == 0){
1097 primerNameVector.push_back("");
1100 fastaFileNames.resize(barcodeNameVector.size());
1101 for(int i=0;i<fastaFileNames.size();i++){
1102 fastaFileNames[i].assign(primerNameVector.size(), "");
1104 if(qFileName != "") { qualFileNames = fastaFileNames; }
1105 if(nameFile != "") { nameFileNames = fastaFileNames; }
1108 set<string> uniqueNames; //used to cleanup outputFileNames
1109 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1110 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1112 string primerName = primerNameVector[itPrimer->second];
1113 string barcodeName = barcodeNameVector[itBar->second];
1115 string comboGroupName = "";
1116 string fastaFileName = "";
1117 string qualFileName = "";
1118 string nameFileName = "";
1120 if(primerName == ""){
1121 comboGroupName = barcodeNameVector[itBar->second];
1124 if(barcodeName == ""){
1125 comboGroupName = primerNameVector[itPrimer->second];
1128 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1134 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1135 if (uniqueNames.count(fastaFileName) == 0) {
1136 outputNames.push_back(fastaFileName);
1137 outputTypes["fasta"].push_back(fastaFileName);
1138 uniqueNames.insert(fastaFileName);
1141 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1142 m->openOutputFile(fastaFileName, temp); temp.close();
1144 if(qFileName != ""){
1145 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1146 if (uniqueNames.count(qualFileName) == 0) {
1147 outputNames.push_back(qualFileName);
1148 outputTypes["qfile"].push_back(qualFileName);
1151 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1152 m->openOutputFile(qualFileName, temp); temp.close();
1156 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1157 if (uniqueNames.count(nameFileName) == 0) {
1158 outputNames.push_back(nameFileName);
1159 outputTypes["name"].push_back(nameFileName);
1162 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1163 m->openOutputFile(nameFileName, temp); temp.close();
1169 numFPrimers = primers.size();
1170 numRPrimers = revPrimer.size();
1173 catch(exception& e) {
1174 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1179 //***************************************************************************************************************
1181 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1184 string rawSequence = seq.getUnaligned();
1185 int success = bdiffs + 1; //guilty until proven innocent
1187 //can you find the barcode
1188 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1189 string oligo = it->first;
1190 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
1191 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
1195 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1197 seq.setUnaligned(rawSequence.substr(oligo.length()));
1199 if(qual.getName() != ""){
1200 qual.trimQScores(oligo.length(), -1);
1208 //if you found the barcode or if you don't want to allow for diffs
1209 if ((bdiffs == 0) || (success == 0)) { return success; }
1211 else { //try aligning and see if you can find it
1215 Alignment* alignment;
1216 if (barcodes.size() > 0) {
1217 map<string,int>::iterator it=barcodes.begin();
1219 for(it;it!=barcodes.end();it++){
1220 if(it->first.length() > maxLength){
1221 maxLength = it->first.length();
1224 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
1226 }else{ alignment = NULL; }
1228 //can you find the barcode
1234 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1235 string oligo = it->first;
1236 // int length = oligo.length();
1238 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
1239 success = bdiffs + 10;
1243 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1244 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1245 oligo = alignment->getSeqAAln();
1246 string temp = alignment->getSeqBAln();
1248 int alnLength = oligo.length();
1250 for(int i=oligo.length()-1;i>=0;i--){
1251 if(oligo[i] != '-'){ alnLength = i+1; break; }
1253 oligo = oligo.substr(0,alnLength);
1254 temp = temp.substr(0,alnLength);
1256 int numDiff = countDiffs(oligo, temp);
1258 if(numDiff < minDiff){
1261 minGroup = it->second;
1263 for(int i=0;i<alnLength;i++){
1269 else if(numDiff == minDiff){
1275 if(minDiff > bdiffs) { success = minDiff; } //no good matches
1276 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
1277 else{ //use the best match
1279 seq.setUnaligned(rawSequence.substr(minPos));
1281 if(qual.getName() != ""){
1282 qual.trimQScores(minPos, -1);
1287 if (alignment != NULL) { delete alignment; }
1294 catch(exception& e) {
1295 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1301 //***************************************************************************************************************
1303 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1305 string rawSequence = seq.getUnaligned();
1306 int success = pdiffs + 1; //guilty until proven innocent
1308 //can you find the primer
1309 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1310 string oligo = it->first;
1311 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
1312 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1316 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1318 seq.setUnaligned(rawSequence.substr(oligo.length()));
1319 if(qual.getName() != ""){
1320 qual.trimQScores(oligo.length(), -1);
1327 //if you found the barcode or if you don't want to allow for diffs
1328 if ((pdiffs == 0) || (success == 0)) { return success; }
1330 else { //try aligning and see if you can find it
1334 Alignment* alignment;
1335 if (primers.size() > 0) {
1336 map<string,int>::iterator it=primers.begin();
1338 for(it;it!=primers.end();it++){
1339 if(it->first.length() > maxLength){
1340 maxLength = it->first.length();
1343 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
1345 }else{ alignment = NULL; }
1347 //can you find the barcode
1353 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1354 string oligo = it->first;
1355 // int length = oligo.length();
1357 if(rawSequence.length() < maxLength){
1358 success = pdiffs + 100;
1362 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1363 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1364 oligo = alignment->getSeqAAln();
1365 string temp = alignment->getSeqBAln();
1367 int alnLength = oligo.length();
1369 for(int i=oligo.length()-1;i>=0;i--){
1370 if(oligo[i] != '-'){ alnLength = i+1; break; }
1372 oligo = oligo.substr(0,alnLength);
1373 temp = temp.substr(0,alnLength);
1375 int numDiff = countDiffs(oligo, temp);
1377 if(numDiff < minDiff){
1380 minGroup = it->second;
1382 for(int i=0;i<alnLength;i++){
1388 else if(numDiff == minDiff){
1394 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1395 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
1396 else{ //use the best match
1398 seq.setUnaligned(rawSequence.substr(minPos));
1399 if(qual.getName() != ""){
1400 qual.trimQScores(minPos, -1);
1405 if (alignment != NULL) { delete alignment; }
1412 catch(exception& e) {
1413 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1418 //***************************************************************************************************************
1420 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1422 string rawSequence = seq.getUnaligned();
1423 bool success = 0; //guilty until proven innocent
1425 for(int i=0;i<numRPrimers;i++){
1426 string oligo = revPrimer[i];
1428 if(rawSequence.length() < oligo.length()){
1433 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1434 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1435 if(qual.getName() != ""){
1436 qual.trimQScores(-1, rawSequence.length()-oligo.length());
1445 catch(exception& e) {
1446 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1451 //***************************************************************************************************************
1453 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1456 if(qscores.getName() != ""){
1457 qscores.trimQScores(-1, keepFirst);
1459 sequence.trim(keepFirst);
1462 catch(exception& e) {
1463 m->errorOut(e, "keepFirstTrim", "countDiffs");
1469 //***************************************************************************************************************
1471 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1475 int length = sequence.getNumBases() - removeLast;
1478 if(qscores.getName() != ""){
1479 qscores.trimQScores(-1, length);
1481 sequence.trim(length);
1490 catch(exception& e) {
1491 m->errorOut(e, "removeLastTrim", "countDiffs");
1497 //***************************************************************************************************************
1499 bool TrimSeqsCommand::cullLength(Sequence& seq){
1502 int length = seq.getNumBases();
1503 bool success = 0; //guilty until proven innocent
1505 if(length >= minLength && maxLength == 0) { success = 1; }
1506 else if(length >= minLength && length <= maxLength) { success = 1; }
1507 else { success = 0; }
1512 catch(exception& e) {
1513 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1519 //***************************************************************************************************************
1521 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1523 int longHomoP = seq.getLongHomoPolymer();
1524 bool success = 0; //guilty until proven innocent
1526 if(longHomoP <= maxHomoP){ success = 1; }
1527 else { success = 0; }
1531 catch(exception& e) {
1532 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1538 //***************************************************************************************************************
1540 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1542 int numNs = seq.getAmbigBases();
1543 bool success = 0; //guilty until proven innocent
1545 if(numNs <= maxAmbig) { success = 1; }
1546 else { success = 0; }
1550 catch(exception& e) {
1551 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1557 //***************************************************************************************************************
1559 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1562 int length = oligo.length();
1564 for(int i=0;i<length;i++){
1566 if(oligo[i] != seq[i]){
1567 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
1568 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
1569 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
1570 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
1571 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
1572 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1573 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
1574 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1575 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1576 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1577 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
1578 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1580 if(success == 0) { break; }
1589 catch(exception& e) {
1590 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1596 //***************************************************************************************************************
1598 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1601 int length = oligo.length();
1604 for(int i=0;i<length;i++){
1606 if(oligo[i] != seq[i]){
1607 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
1608 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
1609 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
1610 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
1611 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
1612 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1613 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
1614 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1615 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1616 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1617 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
1618 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1625 catch(exception& e) {
1626 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1632 //***************************************************************************************************************