5 * Created by Pat Schloss on 6/6/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "trimseqscommand.h"
11 #include "needlemanoverlap.hpp"
13 //**********************************************************************************************************************
14 vector<string> TrimSeqsCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
18 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
19 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
20 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
21 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
22 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
23 CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
24 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
25 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
26 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
27 CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
28 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
29 CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
30 CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
31 CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
32 CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
33 CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage);
34 CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage);
35 CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize);
36 CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize);
37 CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
38 CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
39 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
40 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
42 vector<string> myArray;
43 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
47 m->errorOut(e, "TrimSeqsCommand", "setParameters");
51 //**********************************************************************************************************************
52 string TrimSeqsCommand::getHelpString(){
54 string helpString = "";
55 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
56 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
57 helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
58 helpString += "The fasta parameter is required.\n";
59 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
60 helpString += "The oligos parameter allows you to provide an oligos file.\n";
61 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
62 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
63 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
64 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
65 helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
66 helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
67 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
68 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
69 helpString += "The qfile parameter allows you to provide a quality file.\n";
70 helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
71 helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
72 helpString += "The qwindowsize parameter allows you to set a number of bases in a window. Default=50.\n";
73 helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n";
74 helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
75 helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
76 helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
77 helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
78 helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
79 helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
80 helpString += "The trim.seqs command should be in the following format: \n";
81 helpString += "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n";
82 helpString += "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n";
83 helpString += "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n";
84 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
85 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n";
89 m->errorOut(e, "TrimSeqsCommand", "getHelpString");
95 //**********************************************************************************************************************
97 TrimSeqsCommand::TrimSeqsCommand(){
99 abort = true; calledHelp = true;
101 vector<string> tempOutNames;
102 outputTypes["fasta"] = tempOutNames;
103 outputTypes["qfile"] = tempOutNames;
104 outputTypes["group"] = tempOutNames;
105 outputTypes["name"] = tempOutNames;
107 catch(exception& e) {
108 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
112 //***************************************************************************************************************
114 TrimSeqsCommand::TrimSeqsCommand(string option) {
117 abort = false; calledHelp = false;
120 //allow user to run help
121 if(option == "help") { help(); abort = true; calledHelp = true; }
122 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
125 vector<string> myArray = setParameters();
127 OptionParser parser(option);
128 map<string,string> parameters = parser.getParameters();
130 ValidParameters validParameter;
131 map<string,string>::iterator it;
133 //check to make sure all parameters are valid for command
134 for (it = parameters.begin(); it != parameters.end(); it++) {
135 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
138 //initialize outputTypes
139 vector<string> tempOutNames;
140 outputTypes["fasta"] = tempOutNames;
141 outputTypes["qfile"] = tempOutNames;
142 outputTypes["group"] = tempOutNames;
143 outputTypes["name"] = tempOutNames;
145 //if the user changes the input directory command factory will send this info to us in the output parameter
146 string inputDir = validParameter.validFile(parameters, "inputdir", false);
147 if (inputDir == "not found"){ inputDir = ""; }
150 it = parameters.find("fasta");
151 //user has given a template file
152 if(it != parameters.end()){
153 path = m->hasPath(it->second);
154 //if the user has not given a path then, add inputdir. else leave path alone.
155 if (path == "") { parameters["fasta"] = inputDir + it->second; }
158 it = parameters.find("oligos");
159 //user has given a template file
160 if(it != parameters.end()){
161 path = m->hasPath(it->second);
162 //if the user has not given a path then, add inputdir. else leave path alone.
163 if (path == "") { parameters["oligos"] = inputDir + it->second; }
166 it = parameters.find("qfile");
167 //user has given a template file
168 if(it != parameters.end()){
169 path = m->hasPath(it->second);
170 //if the user has not given a path then, add inputdir. else leave path alone.
171 if (path == "") { parameters["qfile"] = inputDir + it->second; }
174 it = parameters.find("name");
175 //user has given a template file
176 if(it != parameters.end()){
177 path = m->hasPath(it->second);
178 //if the user has not given a path then, add inputdir. else leave path alone.
179 if (path == "") { parameters["name"] = inputDir + it->second; }
185 //check for required parameters
186 fastaFile = validParameter.validFile(parameters, "fasta", true);
187 if (fastaFile == "not found") {
188 fastaFile = m->getFastaFile();
189 if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
190 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
191 }else if (fastaFile == "not open") { abort = true; }
193 //if the user changes the output directory command factory will send this info to us in the output parameter
194 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
196 outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
200 //check for optional parameter and set defaults
201 // ...at some point should added some additional type checking...
203 temp = validParameter.validFile(parameters, "flip", false);
204 if (temp == "not found"){ flip = 0; }
205 else if(m->isTrue(temp)) { flip = 1; }
207 temp = validParameter.validFile(parameters, "oligos", true);
208 if (temp == "not found"){ oligoFile = ""; }
209 else if(temp == "not open"){ abort = true; }
210 else { oligoFile = temp; }
213 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
214 convert(temp, maxAmbig);
216 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; }
217 convert(temp, maxHomoP);
219 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; }
220 convert(temp, minLength);
222 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; }
223 convert(temp, maxLength);
225 temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; }
226 convert(temp, bdiffs);
228 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
229 convert(temp, pdiffs);
231 temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
232 convert(temp, tdiffs);
234 if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
236 temp = validParameter.validFile(parameters, "qfile", true);
237 if (temp == "not found") { qFileName = ""; }
238 else if(temp == "not open") { abort = true; }
239 else { qFileName = temp; }
241 temp = validParameter.validFile(parameters, "name", true);
242 if (temp == "not found") { nameFile = ""; }
243 else if(temp == "not open") { nameFile = ""; abort = true; }
244 else { nameFile = temp; }
246 temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; }
247 convert(temp, qThreshold);
249 temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; }
250 qtrim = m->isTrue(temp);
252 temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; }
253 convert(temp, qRollAverage);
255 temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; }
256 convert(temp, qWindowAverage);
258 temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; }
259 convert(temp, qWindowSize);
261 temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; }
262 convert(temp, qWindowStep);
264 temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; }
265 convert(temp, qAverage);
267 temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; }
268 convert(temp, keepFirst);
270 temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; }
271 convert(temp, removeLast);
273 temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
274 allFiles = m->isTrue(temp);
276 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
277 m->setProcessors(temp);
278 convert(temp, processors);
281 if(allFiles && (oligoFile == "")){
282 m->mothurOut("You selected allfiles, but didn't enter an oligos. Ignoring the allfiles request."); m->mothurOutEndLine();
284 if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
285 m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
289 if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){
290 m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine();
296 catch(exception& e) {
297 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
301 //***************************************************************************************************************
303 int TrimSeqsCommand::execute(){
306 if (abort == true) { if (calledHelp) { return 0; } return 2; }
308 numFPrimers = 0; //this needs to be initialized
310 vector<vector<string> > fastaFileNames;
311 vector<vector<string> > qualFileNames;
312 vector<vector<string> > nameFileNames;
314 string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta";
315 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
317 string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta";
318 outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile);
320 string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual";
321 string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual";
323 if (qFileName != "") {
324 outputNames.push_back(trimQualFile);
325 outputNames.push_back(scrapQualFile);
326 outputTypes["qfile"].push_back(trimQualFile);
327 outputTypes["qfile"].push_back(scrapQualFile);
330 string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names";
331 string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names";
333 if (nameFile != "") {
334 m->readNames(nameFile, nameMap);
335 outputNames.push_back(trimNameFile);
336 outputNames.push_back(scrapNameFile);
337 outputTypes["name"].push_back(trimNameFile);
338 outputTypes["name"].push_back(scrapNameFile);
341 if (m->control_pressed) { return 0; }
343 string outputGroupFileName;
345 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
346 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
347 getOligos(fastaFileNames, qualFileNames, nameFileNames);
350 vector<unsigned long int> fastaFilePos;
351 vector<unsigned long int> qFilePos;
353 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
355 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
356 lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
357 if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
359 if(qFileName == "") { qLines = lines; } //files with duds
361 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
363 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
365 createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
368 driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
371 if (m->control_pressed) { return 0; }
374 map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
375 map<string, string>::iterator it;
376 set<string> namesToRemove;
377 for(int i=0;i<fastaFileNames.size();i++){
378 for(int j=0;j<fastaFileNames[0].size();j++){
379 if (fastaFileNames[i][j] != "") {
380 if (namesToRemove.count(fastaFileNames[i][j]) == 0) {
381 if(m->isBlank(fastaFileNames[i][j])){
382 remove(fastaFileNames[i][j].c_str());
383 namesToRemove.insert(fastaFileNames[i][j]);
386 remove(qualFileNames[i][j].c_str());
387 namesToRemove.insert(qualFileNames[i][j]);
391 remove(nameFileNames[i][j].c_str());
392 namesToRemove.insert(nameFileNames[i][j]);
395 it = uniqueFastaNames.find(fastaFileNames[i][j]);
396 if (it == uniqueFastaNames.end()) {
397 uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];
405 //remove names for outputFileNames, just cleans up the output
406 vector<string> outputNames2;
407 for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
408 outputNames = outputNames2;
410 for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
412 m->openInputFile(it->first, in);
415 string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
416 outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
417 m->openOutputFile(thisGroupName, out);
420 if (m->control_pressed) { break; }
422 Sequence currSeq(in); m->gobble(in);
423 out << currSeq.getName() << '\t' << it->second << endl;
430 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
432 //output group counts
433 m->mothurOutEndLine();
435 if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); }
436 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
437 total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine();
439 if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
441 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
443 //set fasta file as new current fastafile
445 itTypes = outputTypes.find("fasta");
446 if (itTypes != outputTypes.end()) {
447 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
450 itTypes = outputTypes.find("name");
451 if (itTypes != outputTypes.end()) {
452 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
455 itTypes = outputTypes.find("qfile");
456 if (itTypes != outputTypes.end()) {
457 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
460 itTypes = outputTypes.find("group");
461 if (itTypes != outputTypes.end()) {
462 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
465 m->mothurOutEndLine();
466 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
467 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
468 m->mothurOutEndLine();
473 catch(exception& e) {
474 m->errorOut(e, "TrimSeqsCommand", "execute");
479 /**************************************************************************************/
481 int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair* line, linePair* qline) {
485 ofstream trimFASTAFile;
486 m->openOutputFile(trimFileName, trimFASTAFile);
488 ofstream scrapFASTAFile;
489 m->openOutputFile(scrapFileName, scrapFASTAFile);
491 ofstream trimQualFile;
492 ofstream scrapQualFile;
494 m->openOutputFile(trimQFileName, trimQualFile);
495 m->openOutputFile(scrapQFileName, scrapQualFile);
498 ofstream trimNameFile;
499 ofstream scrapNameFile;
501 m->openOutputFile(trimNFileName, trimNameFile);
502 m->openOutputFile(scrapNFileName, scrapNameFile);
506 ofstream outGroupsFile;
507 if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); }
509 for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
510 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
511 if (fastaFileNames[i][j] != "") {
513 m->openOutputFile(fastaFileNames[i][j], temp); temp.close();
515 m->openOutputFile(qualFileNames[i][j], temp); temp.close();
519 m->openOutputFile(nameFileNames[i][j], temp); temp.close();
527 m->openInputFile(filename, inFASTA);
528 inFASTA.seekg(line->start);
531 if(qFileName != "") {
532 m->openInputFile(qFileName, qFile);
533 qFile.seekg(qline->start);
541 if (m->control_pressed) {
542 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
543 if (oligoFile != "") { outGroupsFile.close(); }
548 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
554 string trashCode = "";
555 int currentSeqsDiffs = 0;
557 Sequence currSeq(inFASTA); m->gobble(inFASTA);
559 QualityScores currQual;
561 currQual = QualityScores(qFile); m->gobble(qFile);
564 string origSeq = currSeq.getUnaligned();
567 int barcodeIndex = 0;
570 if(barcodes.size() != 0){
571 success = stripBarcode(currSeq, currQual, barcodeIndex);
572 if(success > bdiffs) { trashCode += 'b'; }
573 else{ currentSeqsDiffs += success; }
576 if(numFPrimers != 0){
577 success = stripForward(currSeq, currQual, primerIndex);
578 if(success > pdiffs) { trashCode += 'f'; }
579 else{ currentSeqsDiffs += success; }
582 if (currentSeqsDiffs > tdiffs) { trashCode += 't'; }
584 if(numRPrimers != 0){
585 success = stripReverse(currSeq, currQual);
586 if(!success) { trashCode += 'r'; }
590 success = keepFirstTrim(currSeq, currQual);
594 success = removeLastTrim(currSeq, currQual);
595 if(!success) { trashCode += 'l'; }
600 int origLength = currSeq.getNumBases();
602 if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); }
603 else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); }
604 else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); }
605 else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); }
606 else { success = 1; }
608 //you don't want to trim, if it fails above then scrap it
609 if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
611 if(!success) { trashCode += 'q'; }
614 if(minLength > 0 || maxLength > 0){
615 success = cullLength(currSeq);
616 if(!success) { trashCode += 'l'; }
619 success = cullHomoP(currSeq);
620 if(!success) { trashCode += 'h'; }
623 success = cullAmbigs(currSeq);
624 if(!success) { trashCode += 'n'; }
627 if(flip){ // should go last
628 currSeq.reverseComplement();
630 currQual.flipQScores();
634 if(trashCode.length() == 0){
635 currSeq.setAligned(currSeq.getUnaligned());
636 currSeq.printSequence(trimFASTAFile);
639 currQual.printQScores(trimQualFile);
643 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
644 if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
645 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
648 if(barcodes.size() != 0){
649 string thisGroup = barcodeNameVector[barcodeIndex];
650 if (primers.size() != 0) { if (primerNameVector[primerIndex] != "") { thisGroup += "." + primerNameVector[primerIndex]; } }
652 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
654 if (nameFile != "") {
655 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
656 if (itName != nameMap.end()) {
657 vector<string> thisSeqsNames;
658 m->splitAtChar(itName->second, thisSeqsNames, ',');
659 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
660 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
662 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
665 map<string, int>::iterator it = groupCounts.find(thisGroup);
666 if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; }
667 else { groupCounts[it->first]++; }
674 m->openOutputFileAppend(fastaFileNames[barcodeIndex][primerIndex], output);
675 currSeq.printSequence(output);
679 m->openOutputFileAppend(qualFileNames[barcodeIndex][primerIndex], output);
680 currQual.printQScores(output);
685 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
686 if (itName != nameMap.end()) {
687 m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output);
688 output << itName->first << '\t' << itName->second << endl;
690 }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
695 if(nameFile != ""){ //needs to be before the currSeq name is changed
696 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
697 if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
698 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
700 currSeq.setName(currSeq.getName() + '|' + trashCode);
701 currSeq.setUnaligned(origSeq);
702 currSeq.setAligned(origSeq);
703 currSeq.printSequence(scrapFASTAFile);
705 currQual.printQScores(scrapQualFile);
711 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
712 unsigned long int pos = inFASTA.tellg();
713 if ((pos == -1) || (pos >= line->end)) { break; }
715 if (inFASTA.eof()) { break; }
719 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
723 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
727 trimFASTAFile.close();
728 scrapFASTAFile.close();
729 if (oligoFile != "") { outGroupsFile.close(); }
730 if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
731 if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
735 catch(exception& e) {
736 m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
741 /**************************************************************************************************/
743 int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
745 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
750 //loop through and create all the processes you want
751 while (process != processors) {
755 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
759 vector<vector<string> > tempFASTAFileNames = fastaFileNames;
760 vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
761 vector<vector<string> > tempNameFileNames = nameFileNames;
766 for(int i=0;i<tempFASTAFileNames.size();i++){
767 for(int j=0;j<tempFASTAFileNames[i].size();j++){
768 if (tempFASTAFileNames[i][j] != "") {
769 tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
770 m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
773 tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
774 m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
777 tempNameFileNames[i][j] += toString(getpid()) + ".temp";
778 m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
785 driverCreateTrim(filename,
787 (trimFASTAFileName + toString(getpid()) + ".temp"),
788 (scrapFASTAFileName + toString(getpid()) + ".temp"),
789 (trimQualFileName + toString(getpid()) + ".temp"),
790 (scrapQualFileName + toString(getpid()) + ".temp"),
791 (trimNameFileName + toString(getpid()) + ".temp"),
792 (scrapNameFileName + toString(getpid()) + ".temp"),
793 (groupFile + toString(getpid()) + ".temp"),
795 tempPrimerQualFileNames,
800 //pass groupCounts to parent
802 string tempFile = filename + toString(getpid()) + ".num.temp";
803 m->openOutputFile(tempFile, out);
804 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
805 out << it->first << '\t' << it->second << endl;
811 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
812 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
819 m->openOutputFile(trimFASTAFileName, temp); temp.close();
820 m->openOutputFile(scrapFASTAFileName, temp); temp.close();
822 m->openOutputFile(trimQualFileName, temp); temp.close();
823 m->openOutputFile(scrapQualFileName, temp); temp.close();
825 if (nameFile != "") {
826 m->openOutputFile(trimNameFileName, temp); temp.close();
827 m->openOutputFile(scrapNameFileName, temp); temp.close();
830 driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
832 //force parent to wait until all the processes are done
833 for (int i=0;i<processIDS.size();i++) {
834 int temp = processIDS[i];
839 for(int i=0;i<processIDS.size();i++){
841 m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
843 m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName);
844 remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
845 m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName);
846 remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str());
849 m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName);
850 remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str());
851 m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName);
852 remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str());
856 m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName);
857 remove((trimNameFileName + toString(processIDS[i]) + ".temp").c_str());
858 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
859 remove((scrapNameFileName + toString(processIDS[i]) + ".temp").c_str());
862 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
863 remove((groupFile + toString(processIDS[i]) + ".temp").c_str());
867 for(int j=0;j<fastaFileNames.size();j++){
868 for(int k=0;k<fastaFileNames[j].size();k++){
869 if (fastaFileNames[j][k] != "") {
870 m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
871 remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
874 m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
875 remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
879 m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]);
880 remove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
888 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
889 m->openInputFile(tempFile, in);
893 in >> group >> tempNum; m->gobble(in);
895 map<string, int>::iterator it = groupCounts.find(group);
896 if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
897 else { groupCounts[it->first] += tempNum; }
899 in.close(); remove(tempFile.c_str());
906 catch(exception& e) {
907 m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
912 /**************************************************************************************************/
914 int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
916 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
917 //set file positions for fasta file
918 fastaFilePos = m->divideFile(filename, processors);
920 if (qfilename == "") { return processors; }
922 //get name of first sequence in each chunk
923 map<string, int> firstSeqNames;
924 for (int i = 0; i < (fastaFilePos.size()-1); i++) {
926 m->openInputFile(filename, in);
927 in.seekg(fastaFilePos[i]);
930 firstSeqNames[temp.getName()] = i;
935 //seach for filePos of each first name in the qfile and save in qfileFilePos
937 m->openInputFile(qfilename, inQual);
940 while(!inQual.eof()){
941 input = m->getline(inQual);
943 if (input.length() != 0) {
944 if(input[0] == '>'){ //this is a sequence name line
945 istringstream nameStream(input);
947 string sname = ""; nameStream >> sname;
948 sname = sname.substr(1);
950 map<string, int>::iterator it = firstSeqNames.find(sname);
952 if(it != firstSeqNames.end()) { //this is the start of a new chunk
953 unsigned long int pos = inQual.tellg();
954 qfileFilePos.push_back(pos - input.length() - 1);
955 firstSeqNames.erase(it);
960 if (firstSeqNames.size() == 0) { break; }
965 if (firstSeqNames.size() != 0) {
966 for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
967 m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
973 //get last file position of qfile
975 unsigned long int size;
977 //get num bytes in file
978 pFile = fopen (qfilename.c_str(),"rb");
979 if (pFile==NULL) perror ("Error opening file");
981 fseek (pFile, 0, SEEK_END);
986 qfileFilePos.push_back(size);
992 fastaFilePos.push_back(0); qfileFilePos.push_back(0);
993 //get last file position of fastafile
995 unsigned long int size;
997 //get num bytes in file
998 pFile = fopen (filename.c_str(),"rb");
999 if (pFile==NULL) perror ("Error opening file");
1001 fseek (pFile, 0, SEEK_END);
1005 fastaFilePos.push_back(size);
1007 //get last file position of fastafile
1010 //get num bytes in file
1011 qFile = fopen (qfilename.c_str(),"rb");
1012 if (qFile==NULL) perror ("Error opening file");
1014 fseek (qFile, 0, SEEK_END);
1018 qfileFilePos.push_back(size);
1024 catch(exception& e) {
1025 m->errorOut(e, "TrimSeqsCommand", "setLines");
1030 //***************************************************************************************************************
1032 void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames, vector<vector<string> >& nameFileNames){
1035 m->openInputFile(oligoFile, inOligos);
1039 string type, oligo, group;
1041 int indexPrimer = 0;
1042 int indexBarcode = 0;
1044 while(!inOligos.eof()){
1049 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
1050 m->gobble(inOligos);
1053 m->gobble(inOligos);
1054 //make type case insensitive
1055 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
1059 for(int i=0;i<oligo.length();i++){
1060 oligo[i] = toupper(oligo[i]);
1061 if(oligo[i] == 'U') { oligo[i] = 'T'; }
1064 if(type == "FORWARD"){
1067 // get rest of line in case there is a primer name
1068 while (!inOligos.eof()) {
1069 char c = inOligos.get();
1070 if (c == 10 || c == 13){ break; }
1071 else if (c == 32 || c == 9){;} //space or tab
1072 else { group += c; }
1075 //check for repeat barcodes
1076 map<string, int>::iterator itPrime = primers.find(oligo);
1077 if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1079 primers[oligo]=indexPrimer; indexPrimer++;
1080 primerNameVector.push_back(group);
1082 else if(type == "REVERSE"){
1083 Sequence oligoRC("reverse", oligo);
1084 oligoRC.reverseComplement();
1085 revPrimer.push_back(oligoRC.getUnaligned());
1087 else if(type == "BARCODE"){
1090 //check for repeat barcodes
1091 map<string, int>::iterator itBar = barcodes.find(oligo);
1092 if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); }
1094 barcodes[oligo]=indexBarcode; indexBarcode++;
1095 barcodeNameVector.push_back(group);
1097 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
1099 m->gobble(inOligos);
1103 if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; }
1105 //add in potential combos
1106 if(barcodeNameVector.size() == 0){
1108 barcodeNameVector.push_back("");
1111 if(primerNameVector.size() == 0){
1113 primerNameVector.push_back("");
1116 fastaFileNames.resize(barcodeNameVector.size());
1117 for(int i=0;i<fastaFileNames.size();i++){
1118 fastaFileNames[i].assign(primerNameVector.size(), "");
1120 if(qFileName != "") { qualFileNames = fastaFileNames; }
1121 if(nameFile != "") { nameFileNames = fastaFileNames; }
1124 set<string> uniqueNames; //used to cleanup outputFileNames
1125 for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
1126 for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
1128 string primerName = primerNameVector[itPrimer->second];
1129 string barcodeName = barcodeNameVector[itBar->second];
1131 string comboGroupName = "";
1132 string fastaFileName = "";
1133 string qualFileName = "";
1134 string nameFileName = "";
1136 if(primerName == ""){
1137 comboGroupName = barcodeNameVector[itBar->second];
1140 if(barcodeName == ""){
1141 comboGroupName = primerNameVector[itPrimer->second];
1144 comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
1150 fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
1151 if (uniqueNames.count(fastaFileName) == 0) {
1152 outputNames.push_back(fastaFileName);
1153 outputTypes["fasta"].push_back(fastaFileName);
1154 uniqueNames.insert(fastaFileName);
1157 fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
1158 m->openOutputFile(fastaFileName, temp); temp.close();
1160 if(qFileName != ""){
1161 qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
1162 if (uniqueNames.count(qualFileName) == 0) {
1163 outputNames.push_back(qualFileName);
1164 outputTypes["qfile"].push_back(qualFileName);
1167 qualFileNames[itBar->second][itPrimer->second] = qualFileName;
1168 m->openOutputFile(qualFileName, temp); temp.close();
1172 nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names";
1173 if (uniqueNames.count(nameFileName) == 0) {
1174 outputNames.push_back(nameFileName);
1175 outputTypes["name"].push_back(nameFileName);
1178 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
1179 m->openOutputFile(nameFileName, temp); temp.close();
1185 numFPrimers = primers.size();
1186 numRPrimers = revPrimer.size();
1189 catch(exception& e) {
1190 m->errorOut(e, "TrimSeqsCommand", "getOligos");
1195 //***************************************************************************************************************
1197 int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
1200 string rawSequence = seq.getUnaligned();
1201 int success = bdiffs + 1; //guilty until proven innocent
1203 //can you find the barcode
1204 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1205 string oligo = it->first;
1206 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
1207 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
1211 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1213 seq.setUnaligned(rawSequence.substr(oligo.length()));
1215 if(qual.getName() != ""){
1216 qual.trimQScores(oligo.length(), -1);
1224 //if you found the barcode or if you don't want to allow for diffs
1225 if ((bdiffs == 0) || (success == 0)) { return success; }
1227 else { //try aligning and see if you can find it
1231 Alignment* alignment;
1232 if (barcodes.size() > 0) {
1233 map<string,int>::iterator it=barcodes.begin();
1235 for(it;it!=barcodes.end();it++){
1236 if(it->first.length() > maxLength){
1237 maxLength = it->first.length();
1240 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
1242 }else{ alignment = NULL; }
1244 //can you find the barcode
1250 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1251 string oligo = it->first;
1252 // int length = oligo.length();
1254 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
1255 success = bdiffs + 10;
1259 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1260 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1261 oligo = alignment->getSeqAAln();
1262 string temp = alignment->getSeqBAln();
1264 int alnLength = oligo.length();
1266 for(int i=oligo.length()-1;i>=0;i--){
1267 if(oligo[i] != '-'){ alnLength = i+1; break; }
1269 oligo = oligo.substr(0,alnLength);
1270 temp = temp.substr(0,alnLength);
1272 int numDiff = countDiffs(oligo, temp);
1274 if(numDiff < minDiff){
1277 minGroup = it->second;
1279 for(int i=0;i<alnLength;i++){
1285 else if(numDiff == minDiff){
1291 if(minDiff > bdiffs) { success = minDiff; } //no good matches
1292 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
1293 else{ //use the best match
1295 seq.setUnaligned(rawSequence.substr(minPos));
1297 if(qual.getName() != ""){
1298 qual.trimQScores(minPos, -1);
1303 if (alignment != NULL) { delete alignment; }
1310 catch(exception& e) {
1311 m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
1317 //***************************************************************************************************************
1319 int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
1321 string rawSequence = seq.getUnaligned();
1322 int success = pdiffs + 1; //guilty until proven innocent
1324 //can you find the primer
1325 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1326 string oligo = it->first;
1327 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
1328 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1332 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1334 seq.setUnaligned(rawSequence.substr(oligo.length()));
1335 if(qual.getName() != ""){
1336 qual.trimQScores(oligo.length(), -1);
1343 //if you found the barcode or if you don't want to allow for diffs
1344 if ((pdiffs == 0) || (success == 0)) { return success; }
1346 else { //try aligning and see if you can find it
1350 Alignment* alignment;
1351 if (primers.size() > 0) {
1352 map<string,int>::iterator it=primers.begin();
1354 for(it;it!=primers.end();it++){
1355 if(it->first.length() > maxLength){
1356 maxLength = it->first.length();
1359 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
1361 }else{ alignment = NULL; }
1363 //can you find the barcode
1369 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1370 string oligo = it->first;
1371 // int length = oligo.length();
1373 if(rawSequence.length() < maxLength){
1374 success = pdiffs + 100;
1378 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1379 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
1380 oligo = alignment->getSeqAAln();
1381 string temp = alignment->getSeqBAln();
1383 int alnLength = oligo.length();
1385 for(int i=oligo.length()-1;i>=0;i--){
1386 if(oligo[i] != '-'){ alnLength = i+1; break; }
1388 oligo = oligo.substr(0,alnLength);
1389 temp = temp.substr(0,alnLength);
1391 int numDiff = countDiffs(oligo, temp);
1393 if(numDiff < minDiff){
1396 minGroup = it->second;
1398 for(int i=0;i<alnLength;i++){
1404 else if(numDiff == minDiff){
1410 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1411 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
1412 else{ //use the best match
1414 seq.setUnaligned(rawSequence.substr(minPos));
1415 if(qual.getName() != ""){
1416 qual.trimQScores(minPos, -1);
1421 if (alignment != NULL) { delete alignment; }
1428 catch(exception& e) {
1429 m->errorOut(e, "TrimSeqsCommand", "stripForward");
1434 //***************************************************************************************************************
1436 bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
1438 string rawSequence = seq.getUnaligned();
1439 bool success = 0; //guilty until proven innocent
1441 for(int i=0;i<numRPrimers;i++){
1442 string oligo = revPrimer[i];
1444 if(rawSequence.length() < oligo.length()){
1449 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1450 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1451 if(qual.getName() != ""){
1452 qual.trimQScores(-1, rawSequence.length()-oligo.length());
1461 catch(exception& e) {
1462 m->errorOut(e, "TrimSeqsCommand", "stripReverse");
1467 //***************************************************************************************************************
1469 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
1472 if(qscores.getName() != ""){
1473 qscores.trimQScores(-1, keepFirst);
1475 sequence.trim(keepFirst);
1478 catch(exception& e) {
1479 m->errorOut(e, "keepFirstTrim", "countDiffs");
1485 //***************************************************************************************************************
1487 bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){
1491 int length = sequence.getNumBases() - removeLast;
1494 if(qscores.getName() != ""){
1495 qscores.trimQScores(-1, length);
1497 sequence.trim(length);
1506 catch(exception& e) {
1507 m->errorOut(e, "removeLastTrim", "countDiffs");
1513 //***************************************************************************************************************
1515 bool TrimSeqsCommand::cullLength(Sequence& seq){
1518 int length = seq.getNumBases();
1519 bool success = 0; //guilty until proven innocent
1521 if(length >= minLength && maxLength == 0) { success = 1; }
1522 else if(length >= minLength && length <= maxLength) { success = 1; }
1523 else { success = 0; }
1528 catch(exception& e) {
1529 m->errorOut(e, "TrimSeqsCommand", "cullLength");
1535 //***************************************************************************************************************
1537 bool TrimSeqsCommand::cullHomoP(Sequence& seq){
1539 int longHomoP = seq.getLongHomoPolymer();
1540 bool success = 0; //guilty until proven innocent
1542 if(longHomoP <= maxHomoP){ success = 1; }
1543 else { success = 0; }
1547 catch(exception& e) {
1548 m->errorOut(e, "TrimSeqsCommand", "cullHomoP");
1554 //***************************************************************************************************************
1556 bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
1558 int numNs = seq.getAmbigBases();
1559 bool success = 0; //guilty until proven innocent
1561 if(numNs <= maxAmbig) { success = 1; }
1562 else { success = 0; }
1566 catch(exception& e) {
1567 m->errorOut(e, "TrimSeqsCommand", "cullAmbigs");
1573 //***************************************************************************************************************
1575 bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
1578 int length = oligo.length();
1580 for(int i=0;i<length;i++){
1582 if(oligo[i] != seq[i]){
1583 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
1584 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
1585 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
1586 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
1587 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
1588 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1589 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
1590 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1591 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1592 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1593 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
1594 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1596 if(success == 0) { break; }
1605 catch(exception& e) {
1606 m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
1612 //***************************************************************************************************************
1614 int TrimSeqsCommand::countDiffs(string oligo, string seq){
1617 int length = oligo.length();
1620 for(int i=0;i<length;i++){
1622 if(oligo[i] != seq[i]){
1623 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
1624 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
1625 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
1626 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
1627 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
1628 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1629 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
1630 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1631 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1632 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1633 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
1634 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1641 catch(exception& e) {
1642 m->errorOut(e, "TrimSeqsCommand", "countDiffs");
1648 //***************************************************************************************************************